diff options
Diffstat (limited to 'arch/x86')
163 files changed, 4005 insertions, 1184 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4f393eb9745f..c55870ac907e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1853,6 +1853,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS If unsure, say y. +choice + prompt "TSX enable mode" + depends on CPU_SUP_INTEL + default X86_INTEL_TSX_MODE_OFF + help + Intel's TSX (Transactional Synchronization Extensions) feature + allows to optimize locking protocols through lock elision which + can lead to a noticeable performance boost. + + On the other hand it has been shown that TSX can be exploited + to form side channel attacks (e.g. TAA) and chances are there + will be more of those attacks discovered in the future. + + Therefore TSX is not enabled by default (aka tsx=off). An admin + might override this decision by tsx=on the command line parameter. + Even with TSX enabled, the kernel will attempt to enable the best + possible TAA mitigation setting depending on the microcode available + for the particular machine. + + This option allows to set the default tsx mode between tsx=on, =off + and =auto. See Documentation/admin-guide/kernel-parameters.txt for more + details. + + Say off if not sure, auto if TSX is in use but it should be used on safe + platforms or on if TSX is in use and the security aspect of tsx is not + relevant. + +config X86_INTEL_TSX_MODE_OFF + bool "off" + help + TSX is disabled if possible - equals to tsx=off command line parameter. + +config X86_INTEL_TSX_MODE_ON + bool "on" + help + TSX is always enabled on TSX capable HW - equals the tsx=on command + line parameter. + +config X86_INTEL_TSX_MODE_AUTO + bool "auto" + help + TSX is enabled on TSX capable HW that is believed to be safe against + side channel attacks- equals the tsx=auto command line parameter. +endchoice + config EFI bool "EFI runtime service support" depends on ACPI @@ -2139,14 +2184,8 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING If unsure, leave at the default value. config HOTPLUG_CPU - bool "Support for hot-pluggable CPUs" + def_bool y depends on SMP - ---help--- - Say Y here to allow turning CPUs off and on. CPUs can be - controlled through /sys/devices/system/cpu. - ( Note: power management support will enable this option - automatically on SMP systems. ) - Say N if you want to disable CPU hotplug. config BOOTPARAM_HOTPLUG_CPU0 bool "Set default setting of cpu0_hotpluggable" @@ -2677,8 +2716,7 @@ config OLPC config OLPC_XO1_PM bool "OLPC XO-1 Power Management" - depends on OLPC && MFD_CS5535 && PM_SLEEP - select MFD_CORE + depends on OLPC && MFD_CS5535=y && PM_SLEEP ---help--- Add support for poweroff and suspend of the OLPC XO-1 laptop. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 6293a8768a91..bec0952c5595 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -189,7 +189,7 @@ config HAVE_MMIOTRACE_SUPPORT config X86_DECODER_SELFTEST bool "x86 instruction decoder selftest" - depends on DEBUG_KERNEL && KPROBES + depends on DEBUG_KERNEL && INSTRUCTION_DECODER depends on !COMPILE_TEST ---help--- Perform x86 instruction decoder selftests at build time. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c5290aecdf06..3dc54d2f79c4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -39,6 +39,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS @@ -48,7 +49,7 @@ export REALMODE_CFLAGS export BITS ifdef CONFIG_X86_NEED_RELOCS - LDFLAGS_vmlinux := --emit-relocs + LDFLAGS_vmlinux := --emit-relocs --discard-none endif # @@ -242,6 +243,15 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre ifdef CONFIG_RETPOLINE KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) + # Additionally, avoid generating expensive indirect jumps which + # are subject to retpolines for small number of switch cases. + # clang turns off jump table generation by default when under + # retpoline builds, however, gcc does not for x86. This has + # only been fixed starting from gcc stable version 8.4.0 and + # onwards, but not for older ones. See gcc bug #86952. + ifneq ($(cc-name), clang) + KBUILD_CFLAGS += $(call cc-option,-fno-jump-tables) + endif endif archscripts: scripts_basic diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index d88a2fddba8c..1c060748c813 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -100,7 +100,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(objtree)/$(obj) $(obj)/header.o: $(obj)/zoffset.h -LDFLAGS_setup.elf := -T +LDFLAGS_setup.elf := -m elf_i386 -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 4b3d92a37c80..39fdede523f2 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -227,6 +227,11 @@ ENTRY(efi32_stub_entry) leal efi32_config(%ebp), %eax movl %eax, efi_config(%ebp) + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + jmp startup_32 ENDPROC(efi32_stub_entry) #endif diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 252fee320816..fb07cfa3f2f9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -16,6 +16,7 @@ #include "error.h" #include "../string.h" #include "../voffset.h" +#include <asm/bootparam_utils.h> /* * WARNING!! diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 32d4ec2e0243..5380d45b1c6e 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -19,7 +19,6 @@ #include <asm/page.h> #include <asm/boot.h> #include <asm/bootparam.h> -#include <asm/bootparam_utils.h> #define BOOT_BOOT_H #include "../ctype.h" diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index cd4df9322501..7bbfe7d35da7 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -76,15 +76,14 @@ static int chksum_final(struct shash_desc *desc, u8 *out) return 0; } -static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, - u8 *out) +static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out) { if (irq_fpu_usable()) { kernel_fpu_begin(); - *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + *(__u16 *)out = crc_t10dif_pcl(crc, data, len); kernel_fpu_end(); } else - *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + *(__u16 *)out = crc_t10dif_generic(crc, data, len); return 0; } @@ -93,15 +92,13 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - return __chksum_finup(&ctx->crc, data, len, out); + return __chksum_finup(ctx->crc, data, len, out); } static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) { - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(&ctx->crc, data, length, out); + return __chksum_finup(0, data, length, out); } static struct shash_alg alg = { diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index 3b6e70d085da..8457cdd47f75 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2) vpaddq t2,t1,t1 vmovq t1x,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index c88c670cb5fc..5851c7418fb7 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx @@ -520,6 +520,12 @@ ENTRY(poly1305_2block_sse2) paddq t2,t1 movq t1,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -558,16 +564,16 @@ ENTRY(poly1305_2block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 5d10b7a85cad..557c1bdda311 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -332,6 +332,23 @@ For 32-bit we have the following conventions - kernel is built with #endif +/* + * Mitigate Spectre v1 for conditional swapgs code paths. + * + * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to + * prevent a speculative swapgs when coming from kernel space. + * + * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path, + * to prevent the swapgs from getting speculatively skipped when coming from + * user space. + */ +.macro FENCE_SWAPGS_USER_ENTRY + ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER +.endm +.macro FENCE_SWAPGS_KERNEL_ENTRY + ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL +.endm + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 60e21ccfb6d6..389800344f69 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -31,6 +31,7 @@ #include <asm/vdso.h> #include <linux/uaccess.h> #include <asm/cpufeature.h> +#include <asm/nospec-branch.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> @@ -213,6 +214,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) #endif user_enter_irqoff(); + + mds_user_clear_cpu_buffers(); } #define SYSCALL_EXIT_WORK_FLAGS \ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 60c4c342316c..49adabd94f88 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -234,6 +234,7 @@ ENTRY(__switch_to_asm) pushl %ebx pushl %edi pushl %esi + pushfl /* switch stack */ movl %esp, TASK_threadsp(%eax) @@ -256,6 +257,7 @@ ENTRY(__switch_to_asm) #endif /* restore callee-saved registers */ + popfl popl %esi popl %edi popl %ebx diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 164cd7529f0b..5ec66fafde4e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -342,6 +342,7 @@ ENTRY(__switch_to_asm) pushq %r13 pushq %r14 pushq %r15 + pushfq /* switch stack */ movq %rsp, TASK_threadsp(%rdi) @@ -364,6 +365,7 @@ ENTRY(__switch_to_asm) #endif /* restore callee-saved registers */ + popfq popq %r15 popq %r14 popq %r13 @@ -529,9 +531,12 @@ END(irq_entries_start) testb $3, CS-ORIG_RAX(%rsp) jz 1f SWAPGS + FENCE_SWAPGS_USER_ENTRY call switch_to_thread_stack + jmp 2f 1: - + FENCE_SWAPGS_KERNEL_ENTRY +2: PUSH_AND_CLEAR_REGS ENCODE_FRAME_POINTER @@ -851,7 +856,7 @@ ENTRY(switch_to_thread_stack) ret END(switch_to_thread_stack) -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -871,6 +876,20 @@ ENTRY(\sym) jnz .Lfrom_usermode_switch_stack_\@ .endif + .if \create_gap == 1 + /* + * If coming from kernel space, create a 6-word gap to allow the + * int3 handler to emulate a call instruction. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_no_gap_\@ + .rept 6 + pushq 5*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS offset=8 +.Lfrom_usermode_no_gap_\@: + .endif + .if \paranoid call paranoid_entry .else @@ -1091,13 +1110,12 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ #endif /* CONFIG_HYPERV */ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 +idtentry int3 do_int3 has_error_code=0 create_gap=1 idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 -idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 @@ -1131,6 +1149,12 @@ ENTRY(paranoid_entry) 1: SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + /* + * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an + * unconditional CR3 write, even in the PTI case. So do an lfence + * to prevent GS speculation, regardless of whether PTI is enabled. + */ + FENCE_SWAPGS_KERNEL_ENTRY ret END(paranoid_entry) @@ -1180,6 +1204,7 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax @@ -1201,6 +1226,8 @@ ENTRY(error_entry) CALL_enter_from_user_mode ret +.Lerror_entry_done_lfence: + FENCE_SWAPGS_KERNEL_ENTRY .Lerror_entry_done: TRACE_IRQS_OFF ret @@ -1219,7 +1246,7 @@ ENTRY(error_entry) cmpq %rax, RIP+8(%rsp) je .Lbstep_iret cmpq $.Lgs_change, RIP+8(%rsp) - jne .Lerror_entry_done + jne .Lerror_entry_done_lfence /* * hack: .Lgs_change can fail with user gsbase. If this happens, fix up @@ -1227,6 +1254,7 @@ ENTRY(error_entry) * .Lgs_change's error handler with kernel gsbase. */ SWAPGS + FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done @@ -1241,6 +1269,7 @@ ENTRY(error_entry) * gsbase and CR3. Switch to kernel gsbase and CR3: */ SWAPGS + FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* @@ -1332,6 +1361,7 @@ ENTRY(nmi) swapgs cld + FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 0a550dc5c525..ab7f730cf7f2 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -48,10 +48,8 @@ targets += $(vdso_img_sodbg) export CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ - -Wl,--no-undefined \ - -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ - $(DISABLE_LTO) +VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ + -z max-page-size=4096 $(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) @@ -103,10 +101,8 @@ CFLAGS_REMOVE_vvar.o = -pg # CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ - -Wl,-soname=linux-vdso.so.1 \ - -Wl,-z,max-page-size=4096 \ - -Wl,-z,common-page-size=4096 +VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \ + -z max-page-size=4096 # 64-bit objects to re-brand as x32 vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y)) @@ -134,7 +130,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 +VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 # This makes sure the $(obj) subdirectory exists even though vdso32/ # is not a kbuild sub-make subdirectory. @@ -180,13 +176,14 @@ $(obj)/vdso32.so.dbg: FORCE \ # The DSO images are built using a special linker script. # quiet_cmd_vdso = VDSO $@ - cmd_vdso = $(CC) -nostdlib -o $@ \ + cmd_vdso = $(LD) -nostdlib -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + -T $(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ - $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) +VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \ + $(call ld-option, --build-id) $(call ld-option, --eh-frame-hdr) \ + -Bsymbolic GCOV_PROFILE := n # diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 9c35dc0a9d64..9f4b1081dee0 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -29,12 +29,12 @@ extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); #ifdef CONFIG_PARAVIRT_CLOCK -extern u8 pvclock_page +extern u8 pvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif #ifdef CONFIG_HYPERV_TSCPAGE -extern u8 hvclock_page +extern u8 hvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif @@ -191,13 +191,24 @@ notrace static inline u64 vgetsns(int *mode) if (gtod->vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); + + /* + * For any memory-mapped vclock type, we need to make sure that gcc + * doesn't cleverly hoist a load before the mode check. Otherwise we + * might end up touching the memory-mapped page even if the vclock in + * question isn't enabled, which will segfault. Hence the barriers. + */ #ifdef CONFIG_PARAVIRT_CLOCK - else if (gtod->vclock_mode == VCLOCK_PVCLOCK) + else if (gtod->vclock_mode == VCLOCK_PVCLOCK) { + barrier(); cycles = vread_pvclock(mode); + } #endif #ifdef CONFIG_HYPERV_TSCPAGE - else if (gtod->vclock_mode == VCLOCK_HVCLOCK) + else if (gtod->vclock_mode == VCLOCK_HVCLOCK) { + barrier(); cycles = vread_hvclock(mode); + } #endif else return 0; diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index c84584bb9402..defb536aebce 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -3,10 +3,16 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/delay.h> +#include <linux/jiffies.h> #include <asm/apicdef.h> +#include <asm/nmi.h> #include "../perf_event.h" +static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp); +static unsigned long perf_nmi_window; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -112,23 +118,144 @@ static __initconst const u64 amd_hw_cache_event_ids }, }; +static __initconst const u64 amd_hw_cache_event_ids_f17h + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */ + [C(RESULT_MISS)] = 0xc860, /* L2$ access from DC Miss */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */ + [C(RESULT_MISS)] = 0, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches */ + [C(RESULT_MISS)] = 0x0081, /* Instruction cache misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */ + [C(RESULT_MISS)] = 0xf045, /* L2 DTLB misses (PT walks) */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */ + [C(RESULT_MISS)] = 0xff85, /* L1 ITLB misses, L2 misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr. */ + [C(RESULT_MISS)] = 0x00c3, /* Retired Mispredicted BI */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + /* - * AMD Performance Monitor K7 and later. + * AMD Performance Monitor K7 and later, up to and including Family 16h: */ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, - [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +/* + * AMD Performance Monitor Family 17h and later: + */ +static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; static u64 amd_pmu_event_map(int hw_event) { + if (boot_cpu_data.x86 >= 0x17) + return amd_f17h_perfmon_event_map[hw_event]; + return amd_perfmon_event_map[hw_event]; } @@ -429,6 +556,131 @@ static void amd_pmu_cpu_dead(int cpu) } } +/* + * When a PMC counter overflows, an NMI is used to process the event and + * reset the counter. NMI latency can result in the counter being updated + * before the NMI can run, which can result in what appear to be spurious + * NMIs. This function is intended to wait for the NMI to run and reset + * the counter to avoid possible unhandled NMI messages. + */ +#define OVERFLOW_WAIT_COUNT 50 + +static void amd_pmu_wait_on_overflow(int idx) +{ + unsigned int i; + u64 counter; + + /* + * Wait for the counter to be reset if it has overflowed. This loop + * should exit very, very quickly, but just in case, don't wait + * forever... + */ + for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { + rdmsrl(x86_pmu_event_addr(idx), counter); + if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + break; + + /* Might be in IRQ context, so can't sleep */ + udelay(1); + } +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + x86_pmu_disable_all(); + + /* + * This shouldn't be called from NMI context, but add a safeguard here + * to return, since if we're in NMI context we can't wait for an NMI + * to reset an overflowed counter value. + */ + if (in_nmi()) + return; + + /* + * Check each counter for overflow and wait for it to be reset by the + * NMI if it has overflowed. This relies on the fact that all active + * counters are always enabled when this function is caled and + * ARCH_PERFMON_EVENTSEL_INT is always set. + */ + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_wait_on_overflow(idx); + } +} + +static void amd_pmu_disable_event(struct perf_event *event) +{ + x86_pmu_disable_event(event); + + /* + * This can be called from NMI context (via x86_pmu_stop). The counter + * may have overflowed, but either way, we'll never see it get reset + * by the NMI if we're already in the NMI. And the NMI latency support + * below will take care of any pending NMI that might have been + * generated by the overflow. + */ + if (in_nmi()) + return; + + amd_pmu_wait_on_overflow(event->hw.idx); +} + +/* + * Because of NMI latency, if multiple PMC counters are active or other sources + * of NMIs are received, the perf NMI handler can handle one or more overflowed + * PMC counters outside of the NMI associated with the PMC overflow. If the NMI + * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel + * back-to-back NMI support won't be active. This PMC handler needs to take into + * account that this can occur, otherwise this could result in unknown NMI + * messages being issued. Examples of this is PMC overflow while in the NMI + * handler when multiple PMCs are active or PMC overflow while handling some + * other source of an NMI. + * + * Attempt to mitigate this by creating an NMI window in which un-handled NMIs + * received during this window will be claimed. This prevents extending the + * window past when it is possible that latent NMIs should be received. The + * per-CPU perf_nmi_tstamp will be set to the window end time whenever perf has + * handled a counter. When an un-handled NMI is received, it will be claimed + * only if arriving within that window. + */ +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int active, handled; + + /* + * Obtain the active count before calling x86_pmu_handle_irq() since + * it is possible that x86_pmu_handle_irq() may make a counter + * inactive (through x86_pmu_stop). + */ + active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX); + + /* Process any counter overflows */ + handled = x86_pmu_handle_irq(regs); + + /* + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. + */ + if (handled) { + this_cpu_write(perf_nmi_tstamp, + jiffies + perf_nmi_window); + + return handled; + } + + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) + return NMI_DONE; + + return NMI_HANDLED; +} + static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -621,11 +873,11 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, .enable_all = x86_pmu_enable_all, .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, + .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_K7_EVNTSEL0, @@ -657,6 +909,9 @@ static int __init amd_core_pmu_init(void) if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) return 0; + /* Avoid calulating the value each time in the NMI handler */ + perf_nmi_window = msecs_to_jiffies(100); + switch (boot_cpu_data.x86) { case 0x15: pr_cont("Fam15h "); @@ -714,9 +969,10 @@ __init int amd_pmu_init(void) x86_pmu.amd_nb_constraints = 0; } - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + if (boot_cpu_data.x86 >= 0x17) + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids)); + else + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); return 0; } @@ -728,7 +984,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - x86_pmu_disable_all(); + amd_pmu_disable_all(); x86_pmu_enable_all(0); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -746,7 +1002,7 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - x86_pmu_disable_all(); + amd_pmu_disable_all(); x86_pmu_enable_all(0); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 8c51844694e2..f24e9adaa316 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -389,7 +389,8 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 config) { config &= ~perf_ibs->cnt_mask; - wrmsrl(hwc->config_base, config); + if (boot_cpu_data.x86 == 0x10) + wrmsrl(hwc->config_base, config); config &= ~perf_ibs->enable_mask; wrmsrl(hwc->config_base, config); } @@ -564,7 +565,8 @@ static struct perf_ibs perf_ibs_op = { }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, - .cnt_mask = IBS_OP_MAX_CNT, + .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | + IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, .max_period = IBS_OP_MAX_CNT << 4, @@ -625,7 +627,7 @@ fail: if (event->attr.sample_type & PERF_SAMPLE_RAW) offset_max = perf_ibs->offset_max; else if (check_rip) - offset_max = 2; + offset_max = 3; else offset_max = 1; do { @@ -672,10 +674,17 @@ fail: throttle = perf_event_overflow(event, &data, ®s); out: - if (throttle) + if (throttle) { perf_ibs_stop(event, 0); - else - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + } else { + period >>= 4; + + if ((ibs_caps & IBS_CAPS_RDWROPCNT) && + (*config & IBS_OP_CNT_CTL)) + period |= *config & IBS_OP_CUR_CNT_RAND; + + perf_ibs_enable_event(perf_ibs, hwc, period); + } perf_event_update_userpage(event); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 4e1d7483b78c..baa7e36073f9 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -19,6 +19,7 @@ #include <asm/cpufeature.h> #include <asm/perf_event.h> #include <asm/msr.h> +#include <asm/smp.h> #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 @@ -209,15 +210,22 @@ static int amd_uncore_event_init(struct perf_event *event) hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; hwc->idx = -1; + if (event->cpu < 0) + return -EINVAL; + /* * SliceMask and ThreadMask need to be set for certain L3 events in * Family 17h. For other events, the two fields do not affect the count. */ - if (l3_mask) - hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK); + if (l3_mask && is_llc_event(event)) { + int thread = 2 * (cpu_data(event->cpu).cpu_core_id % 4); - if (event->cpu < 0) - return -EINVAL; + if (smp_num_siblings > 1) + thread += cpu_data(event->cpu).apicid & 1; + + hwc->config |= (1ULL << (AMD64_L3_THREAD_SHIFT + thread) & + AMD64_L3_THREAD_MASK) | AMD64_L3_SLICE_MASK; + } uncore = event_to_amd_uncore(event); if (!uncore) @@ -407,26 +415,8 @@ static int amd_uncore_cpu_starting(unsigned int cpu) } if (amd_uncore_llc) { - unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared, subleaf, prev_eax = 0; - uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - /* - * Iterate over Cache Topology Definition leaves until no - * more cache descriptions are available. - */ - for (subleaf = 0; subleaf < 5; subleaf++) { - cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx); - - /* EAX[0:4] gives type of cache */ - if (!(eax & 0x1f)) - break; - - prev_eax = eax; - } - nshared = ((prev_eax >> 14) & 0xfff) + 1; - - uncore->id = apicid - (apicid % nshared); + uncore->id = per_cpu(cpu_llc_id, cpu); uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7d12b0d1f359..c1f7b3cb84a9 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -375,7 +375,7 @@ int x86_add_exclusive(unsigned int what) * LBR and BTS are still mutually exclusive. */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) - return 0; + goto out; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); @@ -387,6 +387,7 @@ int x86_add_exclusive(unsigned int what) mutex_unlock(&pmc_reserve_mutex); } +out: atomic_inc(&active_events); return 0; @@ -397,11 +398,15 @@ fail_unlock: void x86_del_exclusive(unsigned int what) { + atomic_dec(&active_events); + + /* + * See the comment in x86_add_exclusive(). + */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); - atomic_dec(&active_events); } int x86_setup_perfctr(struct perf_event *event) @@ -1328,8 +1333,9 @@ void x86_pmu_stop(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + if (test_bit(hwc->idx, cpuc->active_mask)) { x86_pmu.disable(event); + __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; @@ -1426,16 +1432,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs) apic_write(APIC_LVTPC, APIC_DM_NMI); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) { - /* - * Though we deactivated the counter some cpus - * might still deliver spurious interrupts still - * in flight. Catch them: - */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; + if (!test_bit(idx, cpuc->active_mask)) continue; - } event = cpuc->events[idx]; @@ -1968,7 +1966,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) */ static void free_fake_cpuc(struct cpu_hw_events *cpuc) { - kfree(cpuc->shared_regs); + intel_cpuc_finish(cpuc); kfree(cpuc); } @@ -1980,14 +1978,11 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); if (!cpuc) return ERR_PTR(-ENOMEM); - - /* only needed, if we have extra_regs */ - if (x86_pmu.extra_regs) { - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - goto error; - } cpuc->is_fake = 1; + + if (intel_cpuc_prepare(cpuc, cpu)) + goto error; + return cpuc; error: free_fake_cpuc(cpuc); @@ -2250,6 +2245,19 @@ void perf_check_microcode(void) x86_pmu.check_microcode(); } +static int x86_pmu_check_period(struct perf_event *event, u64 value) +{ + if (x86_pmu.check_period && x86_pmu.check_period(event, value)) + return -EINVAL; + + if (value && x86_pmu.limit_period) { + if (x86_pmu.limit_period(event, value) > value) + return -EINVAL; + } + + return 0; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -2274,6 +2282,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .task_ctx_size = sizeof(struct x86_perf_task_context), + .check_period = x86_pmu_check_period, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 24ffa1e88cf9..4d3399405d06 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -71,9 +71,17 @@ struct bts_buffer { static struct pmu bts_pmu; +static int buf_nr_pages(struct page *page) +{ + if (!PagePrivate(page)) + return 1; + + return 1 << page_private(page); +} + static size_t buf_size(struct page *page) { - return 1 << (PAGE_SHIFT + page_private(page)); + return buf_nr_pages(page) * PAGE_SIZE; } static void * @@ -89,9 +97,7 @@ bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) /* count all the high order buffers */ for (pg = 0, nbuf = 0; pg < nr_pages;) { page = virt_to_page(pages[pg]); - if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) - return NULL; - pg += 1 << page_private(page); + pg += buf_nr_pages(page); nbuf++; } @@ -115,7 +121,7 @@ bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) unsigned int __nr_pages; page = virt_to_page(pages[pg]); - __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; + __nr_pages = buf_nr_pages(page); buf->buf[nbuf].page = page; buf->buf[nbuf].offset = offset; buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7bb80151bfff..4a60ed8c4413 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1995,6 +1995,39 @@ static void intel_pmu_nhm_enable_all(int added) intel_pmu_enable_all(added); } +static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on) +{ + u64 val = on ? MSR_TFA_RTM_FORCE_ABORT : 0; + + if (cpuc->tfa_shadow != val) { + cpuc->tfa_shadow = val; + wrmsrl(MSR_TSX_FORCE_ABORT, val); + } +} + +static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) +{ + /* + * We're going to use PMC3, make sure TFA is set before we touch it. + */ + if (cntr == 3 && !cpuc->is_fake) + intel_set_tfa(cpuc, true); +} + +static void intel_tfa_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * If we find PMC3 is no longer used when we enable the PMU, we can + * clear TFA. + */ + if (!test_bit(3, cpuc->active_mask)) + intel_set_tfa(cpuc, false); + + intel_pmu_enable_all(added); +} + static inline u64 intel_pmu_get_status(void) { u64 status; @@ -2640,6 +2673,35 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc) } static struct event_constraint * +dyn_constraint(struct cpu_hw_events *cpuc, struct event_constraint *c, int idx) +{ + WARN_ON_ONCE(!cpuc->constraint_list); + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { + struct event_constraint *cx; + + /* + * grab pre-allocated constraint entry + */ + cx = &cpuc->constraint_list[idx]; + + /* + * initialize dynamic constraint + * with static constraint + */ + *cx = *c; + + /* + * mark constraint as dynamic + */ + cx->flags |= PERF_X86_EVENT_DYNAMIC; + c = cx; + } + + return c; +} + +static struct event_constraint * intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, int idx, struct event_constraint *c) { @@ -2669,27 +2731,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * only needed when constraint has not yet * been cloned (marked dynamic) */ - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { - struct event_constraint *cx; - - /* - * grab pre-allocated constraint entry - */ - cx = &cpuc->constraint_list[idx]; - - /* - * initialize dynamic constraint - * with static constraint - */ - *cx = *c; - - /* - * mark constraint as dynamic, so we - * can free it later on - */ - cx->flags |= PERF_X86_EVENT_DYNAMIC; - c = cx; - } + c = dyn_constraint(cpuc, c, idx); /* * From here on, the constraint is dynamic. @@ -2959,7 +3001,7 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) flags &= ~PERF_SAMPLE_TIME; if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; - if (event->attr.sample_regs_user & ~PEBS_REGS) + if (event->attr.sample_regs_user & ~PEBS_GP_REGS) flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } @@ -3009,7 +3051,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { - if (!event->attr.freq) { + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & ~intel_pmu_free_running_flags(event))) @@ -3209,6 +3251,26 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static bool allow_tsx_force_abort = true; + +static struct event_constraint * +tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event); + + /* + * Without TFA we must not use PMC3. + */ + if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) { + c = dyn_constraint(cpuc, c, idx); + c->idxmsk64 &= ~(1ULL << 3); + c->weight--; + } + + return c; +} + /* * Broadwell: * @@ -3235,6 +3297,11 @@ static u64 bdw_limit_period(struct perf_event *event, u64 left) return left; } +static u64 nhm_limit_period(struct perf_event *event, u64 left) +{ + return max(left, 32ULL); +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -3262,7 +3329,7 @@ ssize_t intel_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -struct intel_shared_regs *allocate_shared_regs(int cpu) +static struct intel_shared_regs *allocate_shared_regs(int cpu) { struct intel_shared_regs *regs; int i; @@ -3294,23 +3361,24 @@ static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) return c; } -static int intel_pmu_cpu_prepare(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); +int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) +{ if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) goto err; } - if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); - cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); + cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); if (!cpuc->constraint_list) goto err_shared_regs; + } + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { cpuc->excl_cntrs = allocate_excl_cntrs(cpu); if (!cpuc->excl_cntrs) goto err_constraint_list; @@ -3332,6 +3400,11 @@ err: return -ENOMEM; } +static int intel_pmu_cpu_prepare(int cpu) +{ + return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); +} + static void flip_smm_bit(void *data) { unsigned long set = *(unsigned long *)data; @@ -3359,6 +3432,12 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; + if (x86_pmu.flags & PMU_FL_TFA) { + WARN_ON_ONCE(cpuc->tfa_shadow); + cpuc->tfa_shadow = ~0ULL; + intel_set_tfa(cpuc, false); + } + if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); @@ -3403,9 +3482,8 @@ static void intel_pmu_cpu_starting(int cpu) } } -static void free_excl_cntrs(int cpu) +static void free_excl_cntrs(struct cpu_hw_events *cpuc) { - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); struct intel_excl_cntrs *c; c = cpuc->excl_cntrs; @@ -3413,14 +3491,19 @@ static void free_excl_cntrs(int cpu) if (c->core_id == -1 || --c->refcnt == 0) kfree(c); cpuc->excl_cntrs = NULL; - kfree(cpuc->constraint_list); - cpuc->constraint_list = NULL; } + + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; } static void intel_pmu_cpu_dying(int cpu) { - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + fini_debug_store_on_cpu(cpu); +} + +void intel_cpuc_finish(struct cpu_hw_events *cpuc) +{ struct intel_shared_regs *pc; pc = cpuc->shared_regs; @@ -3430,9 +3513,12 @@ static void intel_pmu_cpu_dying(int cpu) cpuc->shared_regs = NULL; } - free_excl_cntrs(cpu); + free_excl_cntrs(cpuc); +} - fini_debug_store_on_cpu(cpu); +static void intel_pmu_cpu_dead(int cpu) +{ + intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu)); } static void intel_pmu_sched_task(struct perf_event_context *ctx, @@ -3442,6 +3528,11 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx, intel_pmu_lbr_sched_task(ctx, sched_in); } +static int intel_pmu_check_period(struct perf_event *event, u64 value) +{ + return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0; +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -3521,6 +3612,9 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, + .cpu_dead = intel_pmu_cpu_dead, + + .check_period = intel_pmu_check_period, }; static struct attribute *intel_pmu_attrs[]; @@ -3560,8 +3654,12 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, + .cpu_dead = intel_pmu_cpu_dead, + .guest_get_msrs = intel_guest_get_msrs, .sched_task = intel_pmu_sched_task, + + .check_period = intel_pmu_check_period, }; static __init void intel_clovertown_quirk(void) @@ -3881,8 +3979,11 @@ static struct attribute *intel_pmu_caps_attrs[] = { NULL }; +static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort); + static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, + NULL, /* &dev_attr_allow_tsx_force_abort.attr.attr */ NULL, }; @@ -3996,6 +4097,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + x86_pmu.limit_period = nhm_limit_period; x86_pmu.cpu_events = nhm_events_attrs; @@ -4015,11 +4117,11 @@ __init int intel_pmu_init(void) name = "nehalem"; break; - case INTEL_FAM6_ATOM_PINEVIEW: - case INTEL_FAM6_ATOM_LINCROFT: - case INTEL_FAM6_ATOM_PENWELL: - case INTEL_FAM6_ATOM_CLOVERVIEW: - case INTEL_FAM6_ATOM_CEDARVIEW: + case INTEL_FAM6_ATOM_BONNELL: + case INTEL_FAM6_ATOM_BONNELL_MID: + case INTEL_FAM6_ATOM_SALTWELL: + case INTEL_FAM6_ATOM_SALTWELL_MID: + case INTEL_FAM6_ATOM_SALTWELL_TABLET: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -4032,9 +4134,11 @@ __init int intel_pmu_init(void) name = "bonnell"; break; - case INTEL_FAM6_ATOM_SILVERMONT1: - case INTEL_FAM6_ATOM_SILVERMONT2: + case INTEL_FAM6_ATOM_SILVERMONT: + case INTEL_FAM6_ATOM_SILVERMONT_X: + case INTEL_FAM6_ATOM_SILVERMONT_MID: case INTEL_FAM6_ATOM_AIRMONT: + case INTEL_FAM6_ATOM_AIRMONT_MID: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, @@ -4053,7 +4157,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -4079,7 +4183,7 @@ __init int intel_pmu_init(void) name = "goldmont"; break; - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, @@ -4338,6 +4442,15 @@ __init int intel_pmu_init(void) x86_pmu.cpu_events = get_hsw_events_attrs(); intel_pmu_pebs_data_source_skl( boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); + + if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + x86_pmu.flags |= PMU_FL_TFA; + x86_pmu.get_event_constraints = tfa_get_event_constraints; + x86_pmu.enable_all = intel_tfa_pmu_enable_all; + x86_pmu.commit_scheduling = intel_tfa_commit_scheduling; + intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr; + } + pr_cont("Skylake events, "); name = "skylake"; break; @@ -4479,7 +4592,7 @@ static __init int fixup_ht_bug(void) hardlockup_detector_perf_restart(); for_each_online_cpu(c) - free_excl_cntrs(c); + free_excl_cntrs(&per_cpu(cpu_hw_events, c)); cpus_read_unlock(); pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 357e82dc0e2a..59521c71c98a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -543,8 +543,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), @@ -563,9 +563,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 25386be0d757..3310f9f6c3e1 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -681,7 +681,7 @@ struct event_constraint intel_core2_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01), EVENT_CONSTRAINT_END }; @@ -690,7 +690,7 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01), /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), EVENT_CONSTRAINT_END @@ -698,7 +698,7 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { struct event_constraint intel_slm_pebs_event_constraints[] = { /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x1), /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), EVENT_CONSTRAINT_END @@ -729,7 +729,7 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f), EVENT_CONSTRAINT_END }; @@ -746,7 +746,7 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f), EVENT_CONSTRAINT_END }; @@ -755,7 +755,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf), INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -770,9 +770,9 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf), /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2), INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -786,9 +786,9 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf), /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ @@ -809,9 +809,9 @@ struct event_constraint intel_bdw_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf), /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ @@ -832,9 +832,9 @@ struct event_constraint intel_bdw_pebs_event_constraints[] = { struct event_constraint intel_skl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2), /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f), INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 005908ee9333..d36a5fac6a18 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -775,9 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index a68aba8a482f..6b66285c6ced 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1221,6 +1221,8 @@ static struct pci_driver snbep_uncore_pci_driver = { .id_table = snbep_uncore_pci_ids, }; +#define NODE_ID_MASK 0x7 + /* * build pci bus to socket mapping */ @@ -1242,7 +1244,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); if (err) break; - nodeid = config; + nodeid = config & NODE_ID_MASK; /* get the Node ID mapping */ err = pci_read_config_dword(ubox_dev, idmap_loc, &config); if (err) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 81dd57280441..5eb0669d0795 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -62,14 +62,14 @@ static bool test_intel(int idx) case INTEL_FAM6_BROADWELL_GT3E: case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_ATOM_SILVERMONT1: - case INTEL_FAM6_ATOM_SILVERMONT2: + case INTEL_FAM6_ATOM_SILVERMONT: + case INTEL_FAM6_ATOM_SILVERMONT_X: case INTEL_FAM6_ATOM_AIRMONT: case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3c51fcaf1e34..bfe16631fd1d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -95,25 +95,25 @@ struct amd_nb { PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) -#define PEBS_REGS \ - (PERF_REG_X86_AX | \ - PERF_REG_X86_BX | \ - PERF_REG_X86_CX | \ - PERF_REG_X86_DX | \ - PERF_REG_X86_DI | \ - PERF_REG_X86_SI | \ - PERF_REG_X86_SP | \ - PERF_REG_X86_BP | \ - PERF_REG_X86_IP | \ - PERF_REG_X86_FLAGS | \ - PERF_REG_X86_R8 | \ - PERF_REG_X86_R9 | \ - PERF_REG_X86_R10 | \ - PERF_REG_X86_R11 | \ - PERF_REG_X86_R12 | \ - PERF_REG_X86_R13 | \ - PERF_REG_X86_R14 | \ - PERF_REG_X86_R15) +#define PEBS_GP_REGS \ + ((1ULL << PERF_REG_X86_AX) | \ + (1ULL << PERF_REG_X86_BX) | \ + (1ULL << PERF_REG_X86_CX) | \ + (1ULL << PERF_REG_X86_DX) | \ + (1ULL << PERF_REG_X86_DI) | \ + (1ULL << PERF_REG_X86_SI) | \ + (1ULL << PERF_REG_X86_SP) | \ + (1ULL << PERF_REG_X86_BP) | \ + (1ULL << PERF_REG_X86_IP) | \ + (1ULL << PERF_REG_X86_FLAGS) | \ + (1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) /* * Per register state. @@ -239,6 +239,11 @@ struct cpu_hw_events { int excl_thread_id; /* 0 or 1 */ /* + * SKL TSX_FORCE_ABORT shadow + */ + u64 tfa_shadow; + + /* * AMD specific bits */ struct amd_nb *amd_nb; @@ -639,6 +644,11 @@ struct x86_pmu { * Intel host/guest support (KVM) */ struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); + + /* + * Check period value for PERF_EVENT_IOC_PERIOD ioctl. + */ + int (*check_period) (struct perf_event *event, u64 period); }; struct x86_perf_task_context { @@ -667,6 +677,7 @@ do { \ #define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ #define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ #define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ +#define PMU_FL_TFA 0x20 /* deal with TSX force abort */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -848,7 +859,7 @@ static inline int amd_pmu_init(void) #ifdef CONFIG_CPU_SUP_INTEL -static inline bool intel_pmu_has_bts(struct perf_event *event) +static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period) { struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; @@ -859,7 +870,14 @@ static inline bool intel_pmu_has_bts(struct perf_event *event) hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - return hw_event == bts_event && hwc->sample_period == 1; + return hw_event == bts_event && period == 1; +} + +static inline bool intel_pmu_has_bts(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return intel_pmu_has_bts_period(event, hwc->sample_period); } int intel_pmu_save_and_restart(struct perf_event *event); @@ -868,7 +886,8 @@ struct event_constraint * x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event); -struct intel_shared_regs *allocate_shared_regs(int cpu); +extern int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu); +extern void intel_cpuc_finish(struct cpu_hw_events *cpuc); int intel_pmu_init(void); @@ -1002,9 +1021,13 @@ static inline int intel_pmu_init(void) return 0; } -static inline struct intel_shared_regs *allocate_shared_regs(int cpu) +static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) +{ + return 0; +} + +static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc) { - return NULL; } static inline int is_ht_workaround_enabled(void) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 2e9d58cc371e..924fa9c07368 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -17,6 +17,7 @@ * */ +#include <linux/efi.h> #include <linux/types.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> @@ -101,6 +102,22 @@ static int hv_cpu_init(unsigned int cpu) return 0; } +static int __init hv_pci_init(void) +{ + int gen2vm = efi_enabled(EFI_BOOT); + + /* + * For Generation-2 VM, we exit from pci_arch_init() by returning 0. + * The purpose is to suppress the harmless warning: + * "PCI: Fatal: No config space access function found" + */ + if (gen2vm) + return 0; + + /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ + return 1; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -108,7 +125,7 @@ static int hv_cpu_init(unsigned int cpu) * 1. Setup the hypercall page. * 2. Register Hyper-V specific clocksource. */ -void hyperv_init(void) +void __init hyperv_init(void) { u64 guest_id, required_msrs; union hv_x64_msr_hypercall_contents hypercall_msr; @@ -154,6 +171,8 @@ void hyperv_init(void) hyper_alloc_mmu(); + x86_init.pci.arch_init = hv_pci_init; + /* * Register Hyper-V specific clocksource. */ diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 56c9ebac946f..47718fff0b79 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -57,12 +57,14 @@ static inline int fill_gva_list(u64 gva_list[], int offset, * Lower 12 bits encode the number of additional * pages to flush (in addition to the 'cur' page). */ - if (diff >= HV_TLB_FLUSH_UNIT) + if (diff >= HV_TLB_FLUSH_UNIT) { gva_list[gva_n] |= ~PAGE_MASK; - else if (diff) + cur += HV_TLB_FLUSH_UNIT; + } else if (diff) { gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + cur = end; + } - cur += HV_TLB_FLUSH_UNIT; gva_n++; } while (cur < end); diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 8e02b30cf08e..3ebd77770f98 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -51,7 +51,7 @@ static unsigned long get_dr(int n) /* * fill in the user structure for a core dump.. */ -static void dump_thread32(struct pt_regs *regs, struct user32 *dump) +static void fill_dump(struct pt_regs *regs, struct user32 *dump) { u32 fs, gs; memset(dump, 0, sizeof(*dump)); @@ -157,10 +157,12 @@ static int aout_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; + + fill_dump(cprm->regs, &dump); + strncpy(dump.u_comm, current->comm, sizeof(current->comm)); dump.u_ar0 = offsetof(struct user32, regs); dump.signal = cprm->siginfo->si_signo; - dump_thread32(cprm->regs, &dump); /* * If the size of the dump file exceeds the rlimit, then see diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 41c671854642..789284d19b55 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -62,9 +62,8 @@ } while (0) #define RELOAD_SEG(seg) { \ - unsigned int pre = GET_SEG(seg); \ + unsigned int pre = (seg) | 3; \ unsigned int cur = get_user_seg(seg); \ - pre |= 3; \ if (pre != cur) \ set_user_seg(seg, pre); \ } @@ -73,6 +72,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_32 __user *sc) { unsigned int tmpflags, err = 0; + u16 gs, fs, es, ds; void __user *buf; u32 tmp; @@ -80,16 +80,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; get_user_try { - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. - */ - RELOAD_SEG(gs); - RELOAD_SEG(fs); - RELOAD_SEG(ds); - RELOAD_SEG(es); + gs = GET_SEG(gs); + fs = GET_SEG(fs); + ds = GET_SEG(ds); + es = GET_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(ax); @@ -107,6 +101,17 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); } get_user_catch(err); + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + RELOAD_SEG(gs); + RELOAD_SEG(fs); + RELOAD_SEG(ds); + RELOAD_SEG(es); + err |= fpu__restore_sig(buf, 1); force_iret(); diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a1ed92aae12a..25a5a5c6ae90 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -48,7 +48,7 @@ static inline void generic_apic_probe(void) #ifdef CONFIG_X86_LOCAL_APIC -extern unsigned int apic_verbosity; +extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 72759f131cc5..d09dd91dd0b6 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -50,7 +50,7 @@ static __always_inline void atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) - : "ir" (i)); + : "ir" (i) : "memory"); } /** @@ -64,7 +64,7 @@ static __always_inline void atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) - : "ir" (i)); + : "ir" (i) : "memory"); } /** @@ -90,7 +90,7 @@ static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) static __always_inline void atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" - : "+m" (v->counter)); + : "+m" (v->counter) :: "memory"); } /** @@ -102,7 +102,7 @@ static __always_inline void atomic_inc(atomic_t *v) static __always_inline void atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" - : "+m" (v->counter)); + : "+m" (v->counter) :: "memory"); } /** diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 738495caf05f..e6fad6bbb2ee 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -45,7 +45,7 @@ static __always_inline void atomic64_add(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) - : "er" (i), "m" (v->counter)); + : "er" (i), "m" (v->counter) : "memory"); } /** @@ -59,7 +59,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) - : "er" (i), "m" (v->counter)); + : "er" (i), "m" (v->counter) : "memory"); } /** @@ -86,7 +86,7 @@ static __always_inline void atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) - : "m" (v->counter)); + : "m" (v->counter) : "memory"); } /** @@ -99,7 +99,7 @@ static __always_inline void atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) - : "m" (v->counter)); + : "m" (v->counter) : "memory"); } /** diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a04f0c242a28..bc88797cfa61 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -106,8 +106,8 @@ do { \ #endif /* Atomic operations are already serializing on x86 */ -#define __smp_mb__before_atomic() barrier() -#define __smp_mb__after_atomic() barrier() +#define __smp_mb__before_atomic() do { } while (0) +#define __smp_mb__after_atomic() do { } while (0) #include <asm-generic/barrier.h> diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index a07ffd23e4dd..8fa49cf1211d 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -18,6 +18,20 @@ * Note: efi_info is commonly left uninitialized, but that field has a * private magic, so it is better to leave it unchanged. */ + +#define sizeof_mbr(type, member) ({ sizeof(((type *)0)->member); }) + +#define BOOT_PARAM_PRESERVE(struct_member) \ + { \ + .start = offsetof(struct boot_params, struct_member), \ + .len = sizeof_mbr(struct boot_params, struct_member), \ + } + +struct boot_params_to_save { + unsigned int start; + unsigned int len; +}; + static void sanitize_boot_params(struct boot_params *boot_params) { /* @@ -36,19 +50,41 @@ static void sanitize_boot_params(struct boot_params *boot_params) */ if (boot_params->sentinel) { /* fields in boot_params are left uninitialized, clear them */ - memset(&boot_params->ext_ramdisk_image, 0, - (char *)&boot_params->efi_info - - (char *)&boot_params->ext_ramdisk_image); - memset(&boot_params->kbd_status, 0, - (char *)&boot_params->hdr - - (char *)&boot_params->kbd_status); - memset(&boot_params->_pad7[0], 0, - (char *)&boot_params->edd_mbr_sig_buffer[0] - - (char *)&boot_params->_pad7[0]); - memset(&boot_params->_pad8[0], 0, - (char *)&boot_params->eddbuf[0] - - (char *)&boot_params->_pad8[0]); - memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9)); + static struct boot_params scratch; + char *bp_base = (char *)boot_params; + char *save_base = (char *)&scratch; + int i; + + const struct boot_params_to_save to_save[] = { + BOOT_PARAM_PRESERVE(screen_info), + BOOT_PARAM_PRESERVE(apm_bios_info), + BOOT_PARAM_PRESERVE(tboot_addr), + BOOT_PARAM_PRESERVE(ist_info), + BOOT_PARAM_PRESERVE(hd0_info), + BOOT_PARAM_PRESERVE(hd1_info), + BOOT_PARAM_PRESERVE(sys_desc_table), + BOOT_PARAM_PRESERVE(olpc_ofw_header), + BOOT_PARAM_PRESERVE(efi_info), + BOOT_PARAM_PRESERVE(alt_mem_k), + BOOT_PARAM_PRESERVE(scratch), + BOOT_PARAM_PRESERVE(e820_entries), + BOOT_PARAM_PRESERVE(eddbuf_entries), + BOOT_PARAM_PRESERVE(edd_mbr_sig_buf_entries), + BOOT_PARAM_PRESERVE(edd_mbr_sig_buffer), + BOOT_PARAM_PRESERVE(secure_boot), + BOOT_PARAM_PRESERVE(hdr), + BOOT_PARAM_PRESERVE(e820_table), + BOOT_PARAM_PRESERVE(eddbuf), + }; + + memset(&scratch, 0, sizeof(scratch)); + + for (i = 0; i < ARRAY_SIZE(to_save); i++) { + memcpy(save_base + to_save[i].start, + bp_base + to_save[i].start, to_save[i].len); + } + + memcpy(boot_params, save_base, sizeof(*boot_params)); } } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 70eddb3922ff..4e2d03135854 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -22,8 +22,8 @@ enum cpuid_leafs CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, - CPUID_F_0_EDX, - CPUID_F_1_EDX, + CPUID_LNX_4, + CPUID_DUMMY, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 7d910827126b..b4bef819d5d5 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -239,12 +239,14 @@ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ @@ -269,13 +271,18 @@ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0xf, etc. + * + * Reuse free bits when adding new feature flags! + */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ +#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ +#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ @@ -339,6 +346,8 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ +#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ @@ -376,5 +385,10 @@ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ +#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ +#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h index a7adb2bfbf0b..6b8ad6fa3979 100644 --- a/arch/x86/include/asm/crash.h +++ b/arch/x86/include/asm/crash.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_CRASH_H #define _ASM_X86_CRASH_H +struct kimage; + int crash_load_segments(struct kimage *image); int crash_copy_backup_region(struct kimage *image); int crash_setup_memmap_entries(struct kimage *image, diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index a399c1ebf6f0..96fd0251f8f5 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -82,8 +82,7 @@ struct efi_scratch { #define arch_efi_call_virt_setup() \ ({ \ efi_sync_low_kernel_mappings(); \ - preempt_disable(); \ - __kernel_fpu_begin(); \ + kernel_fpu_begin(); \ firmware_restrict_branch_speculation_start(); \ \ if (efi_scratch.use_pgd) { \ @@ -104,8 +103,7 @@ struct efi_scratch { } \ \ firmware_restrict_branch_speculation_end(); \ - __kernel_fpu_end(); \ - preempt_enable(); \ + kernel_fpu_end(); \ }) extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 6390bd8c141b..5e12b2319d7a 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -159,7 +159,7 @@ extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); -void native_set_fixmap(enum fixed_addresses idx, +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags); #ifndef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index a9caac9d4a72..b56d504af654 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -12,17 +12,12 @@ #define _ASM_X86_FPU_API_H /* - * Careful: __kernel_fpu_begin/end() must be called with preempt disabled - * and they don't touch the preempt state on their own. - * If you enable preemption after __kernel_fpu_begin(), preempt notifier - * should call the __kernel_fpu_end() to prevent the kernel/user FPU - * state from getting corrupted. KVM for example uses this model. - * - * All other cases use kernel_fpu_begin/end() which disable preemption - * during kernel FPU usage. + * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It + * disables preemption so be careful if you intend to use it for long periods + * of time. + * If you intend to use the FPU in softirq you need to check first with + * irq_fpu_usable() if it is possible. */ -extern void __kernel_fpu_begin(void); -extern void __kernel_fpu_end(void); extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 69dcdf195b61..fa2c93cb42a2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -106,6 +106,9 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); #define user_insn(insn, output, input...) \ ({ \ int err; \ + \ + might_fault(); \ + \ asm volatile(ASM_STAC "\n" \ "1:" #insn "\n\t" \ "2: " ASM_CLAC "\n" \ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 35a6bc4da8ad..5cd7d4e1579d 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -6,7 +6,7 @@ * "Big Core" Processors (Branded as Core, Xeon, etc...) * * The "_X" parts are generally the EP and EX Xeons, or the - * "Extreme" ones, like Broadwell-E. + * "Extreme" ones, like Broadwell-E, or Atom microserver. * * Things ending in "2" are usually because we have no better * name for them. There's no processor called "SILVERMONT2". @@ -51,19 +51,24 @@ /* "Small Core" Processors (Atom) */ -#define INTEL_FAM6_ATOM_PINEVIEW 0x1C -#define INTEL_FAM6_ATOM_LINCROFT 0x26 -#define INTEL_FAM6_ATOM_PENWELL 0x27 -#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 -#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 -#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ -#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C -#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ -#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A +#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ +#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ + +#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ +#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ +#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ + +#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */ +#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ + +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ +#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ + +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ +#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ +#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ +#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 15450a675031..c99c66b41e53 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -6,6 +6,8 @@ #ifndef __ASSEMBLY__ +#include <asm/nospec-branch.h> + /* Provide __cpuidle; we can't safely include <linux/cpu.h> */ #define __cpuidle __attribute__((__section__(".cpuidle.text"))) @@ -54,11 +56,13 @@ static inline void native_irq_enable(void) static inline __cpuidle void native_safe_halt(void) { + mds_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } static inline __cpuidle void native_halt(void) { + mds_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index f327236f0fa7..5125fca472bb 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -67,7 +67,7 @@ struct kimage; /* Memory to backup during crash kdump */ #define KEXEC_BACKUP_SRC_START (0UL) -#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ +#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ /* * CPU does not save ss and sp on stack if execution is already diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 72fac8646e9b..d0e17813a9b0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -277,6 +277,7 @@ struct kvm_rmap_head { struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head lpage_disallowed_link; /* * The following two entries are used to key the shadow page in the @@ -289,6 +290,7 @@ struct kvm_mmu_page { /* hold the gfn of each spte inside spt */ gfn_t *gfns; bool unsync; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ @@ -509,6 +511,7 @@ struct kvm_vcpu_arch { bool tpr_access_reporting; u64 ia32_xss; u64 microcode_version; + u64 arch_capabilities; /* * Paging state of the vcpu @@ -778,6 +781,7 @@ struct kvm_arch { */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; + struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; @@ -853,6 +857,8 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + + struct task_struct *nx_lpage_recovery_thread; }; struct kvm_vm_stat { @@ -866,6 +872,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong nx_lpage_splits; ulong max_mmu_page_hash_collisions; }; @@ -972,7 +979,7 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*tlb_flush)(struct kvm_vcpu *vcpu); + void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); @@ -997,7 +1004,7 @@ struct kvm_x86_ops { void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); - void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); + void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); @@ -1076,6 +1083,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); @@ -1121,7 +1129,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); @@ -1352,25 +1360,29 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +asmlinkage void __noreturn kvm_spurious_fault(void); + /* * Hardware virtualization extension instructions may fault if a * reboot turns off virtualization while processes are running. - * Trap the fault and ignore the instruction if that happens. + * Usually after catching the fault we just panic; during reboot + * instead the instruction is ignored. */ -asmlinkage void kvm_spurious_fault(void); - -#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ - "666: " insn "\n\t" \ - "668: \n\t" \ - ".pushsection .fixup, \"ax\" \n" \ - "667: \n\t" \ - cleanup_insn "\n\t" \ - "cmpb $0, kvm_rebooting \n\t" \ - "jne 668b \n\t" \ - __ASM_SIZE(push) " $666b \n\t" \ - "jmp kvm_spurious_fault \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(666b, 667b) +#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ + "666: \n\t" \ + insn "\n\t" \ + "jmp 668f \n\t" \ + "667: \n\t" \ + "call kvm_spurious_fault \n\t" \ + "668: \n\t" \ + ".pushsection .fixup, \"ax\" \n\t" \ + "700: \n\t" \ + cleanup_insn "\n\t" \ + "cmpb $0, kvm_rebooting\n\t" \ + "je 667b \n\t" \ + "jmp 668b \n\t" \ + ".popsection \n\t" \ + _ASM_EXTABLE(666b, 700b) #define __kvm_handle_fault_on_reboot(insn) \ ____kvm_handle_fault_on_reboot(insn, "") diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index fed3636dce9a..5761a86b88e0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H +#include <linux/bits.h> + /* * CPU model specific register (MSR) numbers. * @@ -40,14 +42,14 @@ /* Intel MSRs. Some also available on other CPUs */ #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ -#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ +#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ -#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ +#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ -#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ -#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ +#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ #define MSR_PPIN_CTL 0x0000004e #define MSR_PPIN 0x0000004f @@ -69,24 +71,45 @@ #define MSR_MTRRcap 0x000000fe #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a -#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ -#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ -#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ -#define ARCH_CAP_SSB_NO (1 << 4) /* - * Not susceptible to Speculative Store Bypass - * attack, so no Speculative Store Bypass - * control required. - */ +#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ +#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ +#define ARCH_CAP_SSB_NO BIT(4) /* + * Not susceptible to Speculative Store Bypass + * attack, so no Speculative Store Bypass + * control required. + */ +#define ARCH_CAP_MDS_NO BIT(5) /* + * Not susceptible to + * Microarchitectural Data + * Sampling (MDS) vulnerabilities. + */ +#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* + * The processor is not susceptible to a + * machine check error due to modifying the + * code page size along with either the + * physical address or cache type + * without TLB invalidation. + */ +#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ +#define ARCH_CAP_TAA_NO BIT(8) /* + * Not susceptible to + * TSX Async Abort (TAA) vulnerabilities. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b -#define L1D_FLUSH (1 << 0) /* - * Writeback and invalidate the - * L1 data cache. - */ +#define L1D_FLUSH BIT(0) /* + * Writeback and invalidate the + * L1 data cache. + */ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e +#define MSR_IA32_TSX_CTRL 0x00000122 +#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ +#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 @@ -327,6 +350,7 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 @@ -610,6 +634,12 @@ #define MSR_IA32_TSC_DEADLINE 0x000006E0 + +#define MSR_TSX_FORCE_ABORT 0x0000010F + +#define MSR_TFA_RTM_FORCE_ABORT_BIT 0 +#define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT) + /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 39a2fb29378a..3aa82deeab5a 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -6,6 +6,7 @@ #include <linux/sched/idle.h> #include <asm/cpufeature.h> +#include <asm/nospec-branch.h> #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf @@ -20,7 +21,7 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 #define MWAITX_ECX_TIMER_ENABLE BIT(1) #define MWAITX_MAX_LOOPS ((u32)-1) -#define MWAITX_DISABLE_CSTATES 0xf +#define MWAITX_DISABLE_CSTATES 0xf0 static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) @@ -40,6 +41,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx, static inline void __mwait(unsigned long eax, unsigned long ecx) { + mds_idle_clear_cpu_buffers(); + /* "mwait %eax, %ecx;" */ asm volatile(".byte 0x0f, 0x01, 0xc9;" :: "a" (eax), "c" (ecx)); @@ -74,6 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void __mwaitx(unsigned long eax, unsigned long ebx, unsigned long ecx) { + /* No MDS buffer clear as this is AMD/HYGON only */ + /* "mwaitx %eax, %ebx, %ecx;" */ asm volatile(".byte 0x0f, 0x01, 0xfb;" :: "a" (eax), "b" (ebx), "c" (ecx)); @@ -81,6 +86,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { + mds_idle_clear_cpu_buffers(); + trace_hardirqs_on(); /* "mwait %eax, %ecx;" */ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index a633767419f2..b73a16a56e4f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -202,7 +202,7 @@ " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ - "903: addl $4, %%esp;\n" \ + "903: lea 4(%%esp), %%esp;\n" \ " pushl %[thunk_target];\n" \ " ret;\n" \ " .align 16\n" \ @@ -317,6 +317,56 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +DECLARE_STATIC_KEY_FALSE(mds_user_clear); +DECLARE_STATIC_KEY_FALSE(mds_idle_clear); + +#include <asm/segment.h> + +/** + * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability + * + * This uses the otherwise unused and obsolete VERW instruction in + * combination with microcode which triggers a CPU buffer flush when the + * instruction is executed. + */ +static inline void mds_clear_cpu_buffers(void) +{ + static const u16 ds = __KERNEL_DS; + + /* + * Has to be the memory-operand variant because only that + * guarantees the CPU buffer flush functionality according to + * documentation. The register-operand variant does not. + * Works with any segment selector, but a valid writable + * data segment is the fastest variant. + * + * "cc" clobber is required because VERW modifies ZF. + */ + asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); +} + +/** + * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability + * + * Clear CPU buffers if the corresponding static key is enabled + */ +static inline void mds_user_clear_cpu_buffers(void) +{ + if (static_branch_likely(&mds_user_clear)) + mds_clear_cpu_buffers(); +} + +/** + * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability + * + * Clear CPU buffers if the corresponding static key is enabled + */ +static inline void mds_idle_clear_cpu_buffers(void) +{ + if (static_branch_likely(&mds_idle_clear)) + mds_clear_cpu_buffers(); +} + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 74d531f6d518..50c8baaca4b0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -7,7 +7,11 @@ #endif #ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_EXTRA +#define KASAN_STACK_ORDER 2 +#else #define KASAN_STACK_ORDER 1 +#endif #else #define KASAN_STACK_ORDER 0 #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c83a2f418cea..4471f0da6ed7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -758,6 +758,7 @@ static __always_inline bool pv_vcpu_is_preempted(long cpu) PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ "ret;" \ + ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") /* Get a reference to a callee-save function */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 78241b736f2a..f6c4915a863e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -209,16 +209,20 @@ struct x86_pmu_capability { #define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) #define IBSCTL_LVT_OFFSET_MASK 0x0F -/* ibs fetch bits/masks */ +/* IBS fetch bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT 0xFFFF0000ULL #define IBS_FETCH_MAX_CNT 0x0000FFFFULL -/* ibs op bits/masks */ -/* lower 4 bits of the current count are ignored: */ -#define IBS_OP_CUR_CNT (0xFFFF0ULL<<32) +/* + * IBS op bits/masks + * The lower 7 bits of the current count are random bits + * preloaded by hardware and ignored in software + */ +#define IBS_OP_CUR_CNT (0xFFF80ULL<<32) +#define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index ef938583147e..3a33de4133d1 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -56,15 +56,15 @@ struct mm_struct; void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); -static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) +static inline void native_set_pte(pte_t *ptep, pte_t pte) { - *ptep = native_make_pte(0); + WRITE_ONCE(*ptep, pte); } -static inline void native_set_pte(pte_t *ptep, pte_t pte) +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - *ptep = pte; + native_set_pte(ptep, native_make_pte(0)); } static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) @@ -74,7 +74,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { - *pmdp = pmd; + WRITE_ONCE(*pmdp, pmd); } static inline void native_pmd_clear(pmd_t *pmd) @@ -110,7 +110,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) static inline void native_set_pud(pud_t *pudp, pud_t pud) { - *pudp = pud; + WRITE_ONCE(*pudp, pud); } static inline void native_pud_clear(pud_t *pud) @@ -220,9 +220,9 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { #if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) - p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); + WRITE_ONCE(p4dp->pgd, pti_set_user_pgd(&p4dp->pgd, p4d.pgd)); #else - *p4dp = p4d; + WRITE_ONCE(*p4dp, p4d); #endif } @@ -238,9 +238,9 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { #ifdef CONFIG_PAGE_TABLE_ISOLATION - *pgdp = pti_set_user_pgd(pgdp, pgd); + WRITE_ONCE(*pgdp, pti_set_user_pgd(pgdp, pgd)); #else - *pgdp = pgd; + WRITE_ONCE(*pgdp, pgd); #endif } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b12c8d70dd33..6a87eda9691e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -988,4 +988,17 @@ enum l1tf_mitigations { extern enum l1tf_mitigations l1tf_mitigation; +enum mds_mitigations { + MDS_MITIGATION_OFF, + MDS_MITIGATION_FULL, + MDS_MITIGATION_VMWERV, +}; + +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 14131dd06b29..8603d127f73c 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -232,23 +232,51 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs, } /** + * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns the address of the @n th entry of the + * kernel stack which is specified by @regs. If the @n th entry is NOT in + * the kernel stack, this returns NULL. + */ +static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return addr; + else + return NULL; +} + +/* To avoid include hell, we can't include uaccess.h */ +extern long probe_kernel_read(void *dst, const void *src, size_t size); + +/** * regs_get_kernel_stack_nth() - get Nth entry of the stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which - * is specified by @regs. If the @n th entry is NOT in the kernel stack, + * is specified by @regs. If the @n th entry is NOT in the kernel stack * this returns 0. */ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) { - unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); - addr += n; - if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; - else - return 0; + unsigned long *addr; + unsigned long val; + long ret; + + addr = regs_get_kernel_stack_nth_addr(regs, n); + if (addr) { + ret = probe_kernel_read(&val, addr, sizeof(val)); + if (!ret) + return val; + } + return 0; } #define arch_has_single_step() (1) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index fe2ee61880a8..a14730ca9d1e 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -175,16 +175,6 @@ static inline int wbinvd_on_all_cpus(void) extern unsigned disabled_cpus; #ifdef CONFIG_X86_LOCAL_APIC - -#ifndef CONFIG_X86_64 -static inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); -} - -#endif - extern int hard_smp_processor_id(void); #else /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 982c325dad33..8be6afb58471 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,7 +12,13 @@ /* image of the saved processor state */ struct saved_context { - u16 es, fs, gs, ss; + /* + * On x86_32, all segment registers, with the possible exception of + * gs, are saved at kernel entry in pt_regs. + */ +#ifdef CONFIG_X86_32_LAZY_GS + u16 gs; +#endif unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 7306e911faee..a7af9f53c0cb 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -20,8 +20,20 @@ */ struct saved_context { struct pt_regs regs; - u16 ds, es, fs, gs, ss; - unsigned long gs_base, gs_kernel_base, fs_base; + + /* + * User CS and SS are saved in current_pt_regs(). The rest of the + * segment selectors need to be saved and restored here. + */ + u16 ds, es, fs, gs; + + /* + * Usermode FSBASE and GSBASE may not match the fs and gs selectors, + * so we save them separately. We save the kernelmode GSBASE to + * restore percpu access after resume. + */ + unsigned long kernelmode_gs_base, usermode_gs_base, fs_base; + unsigned long cr0, cr2, cr3, cr4, cr8; u64 misc_enable; bool misc_enable_saved; @@ -30,8 +42,7 @@ struct saved_context { u16 gdt_pad; /* Unused */ struct desc_ptr gdt_desc; u16 idt_pad; - u16 idt_limit; - unsigned long idt_base; + struct desc_ptr idt; u16 ldt; u16 tss; unsigned long tr; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 12ef2b49d11b..6a7e83092811 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -41,6 +41,7 @@ asmlinkage void ret_from_fork(void); * order of the fields must match the code in __switch_to_asm(). */ struct inactive_task_frame { + unsigned long flags; #ifdef CONFIG_X86_64 unsigned long r15; unsigned long r14; diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 2ecd34e2d46c..12dd61dc3c79 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -38,4 +38,34 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +#ifndef CONFIG_UML_X86 +static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + +#define INT3_INSN_SIZE 1 +#define CALL_INSN_SIZE 5 + +#ifdef CONFIG_X86_64 +static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) +{ + /* + * The int3 handler in entry_64.S adds a gap between the + * stack where the break point happened, and the saving of + * pt_regs. We can extend the original stack because of + * this gap. See the idtentry macro's create_gap option. + */ + regs->sp -= sizeof(unsigned long); + *(unsigned long *)regs->sp = val; +} + +static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) +{ + int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); + int3_emulate_jmp(regs, func); +} +#endif /* CONFIG_X86_64 */ +#endif /* !CONFIG_UML_X86 */ + #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index afbc87206886..b771bb3d159b 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,7 +40,7 @@ asmlinkage void simd_coprocessor_error(void); asmlinkage void xen_divide_error(void); asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); -asmlinkage void xen_xenint3(void); +asmlinkage void xen_int3(void); asmlinkage void xen_overflow(void); asmlinkage void xen_bounds(void); asmlinkage void xen_invalid_op(void); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index aae77eb8491c..971830341061 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -293,8 +293,7 @@ do { \ __put_user_asm(x, ptr, retval, "l", "k", "ir", errret); \ break; \ case 8: \ - __put_user_asm_u64((__typeof__(*ptr))(x), ptr, retval, \ - errret); \ + __put_user_asm_u64(x, ptr, retval, errret); \ break; \ default: \ __put_user_bad(); \ @@ -440,8 +439,10 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ + __typeof__(*(ptr)) __pu_val; \ + __pu_val = x; \ __uaccess_begin(); \ - __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ + __put_user_size(__pu_val, (ptr), (size), __pu_err, -EFAULT);\ __uaccess_end(); \ __builtin_expect(__pu_err, 0); \ }) @@ -450,8 +451,10 @@ do { \ ({ \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ + __typeof__(ptr) __gu_ptr = (ptr); \ + __typeof__(size) __gu_size = (size); \ __uaccess_begin_nospec(); \ - __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __get_user_size(__gu_val, __gu_ptr, __gu_size, __gu_err, -EFAULT); \ __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 1f86e1b0a5cd..499578f7e6d7 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -23,6 +23,12 @@ struct unwind_state { #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; + /* + * If non-NULL: The current frame is incomplete and doesn't contain a + * valid BP. When looking for the next frame, use this instead of the + * non-existent saved BP. + */ + unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index e652a7cc6186..3f697a9e3f59 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -48,7 +48,8 @@ enum { BIOS_STATUS_SUCCESS = 0, BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, BIOS_STATUS_EINVAL = -EINVAL, - BIOS_STATUS_UNAVAIL = -EBUSY + BIOS_STATUS_UNAVAIL = -EBUSY, + BIOS_STATUS_ABORT = -EINTR, }; /* Address map parameters */ @@ -167,4 +168,9 @@ extern long system_serial_number; extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ +/* + * EFI runtime lock; cf. firmware/efi/runtime-wrappers.c for details + */ +extern struct semaphore __efi_uv_runtime_lock; + #endif /* _ASM_X86_UV_BIOS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index bfd882617613..e7e625448008 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -217,6 +217,9 @@ privcmd_call(unsigned call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + if (call >= PAGE_SIZE / sizeof(hypercall_page[0])) + return -EINVAL; + stac(); asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2e64178f284d..6415b4aead54 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -unsigned int apic_verbosity; +int apic_verbosity; int pic_mode; @@ -723,7 +723,7 @@ static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; /* - * Temporary interrupt handler. + * Temporary interrupt handler and polled calibration function. */ static void __init lapic_cal_handler(struct clock_event_device *dev) { @@ -807,7 +807,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); - void (*real_handler)(struct clock_event_device *dev); + u64 tsc_perj = 0, tsc_start = 0; + unsigned long jif_start; unsigned long deltaj; long delta, deltatsc; int pm_referenced = 0; @@ -838,28 +839,64 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" "calibrating APIC timer ...\n"); + /* + * There are platforms w/o global clockevent devices. Instead of + * making the calibration conditional on that, use a polling based + * approach everywhere. + */ local_irq_disable(); - /* Replace the global interrupt handler */ - real_handler = global_clock_event->event_handler; - global_clock_event->event_handler = lapic_cal_handler; - /* * Setup the APIC counter to maximum. There is no way the lapic * can underflow in the 100ms detection time frame */ __setup_APIC_LVTT(0xffffffff, 0, 0); - /* Let the interrupts run */ + /* + * Methods to terminate the calibration loop: + * 1) Global clockevent if available (jiffies) + * 2) TSC if available and frequency is known + */ + jif_start = READ_ONCE(jiffies); + + if (tsc_khz) { + tsc_start = rdtsc(); + tsc_perj = div_u64((u64)tsc_khz * 1000, HZ); + } + + /* + * Enable interrupts so the tick can fire, if a global + * clockevent device is available + */ local_irq_enable(); - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) { + /* Wait for a tick to elapse */ + while (1) { + if (tsc_khz) { + u64 tsc_now = rdtsc(); + if ((tsc_now - tsc_start) >= tsc_perj) { + tsc_start += tsc_perj; + break; + } + } else { + unsigned long jif_now = READ_ONCE(jiffies); - local_irq_disable(); + if (time_after(jif_now, jif_start)) { + jif_start = jif_now; + break; + } + } + cpu_relax(); + } - /* Restore the real event handler */ - global_clock_event->event_handler = real_handler; + /* Invoke the calibration routine */ + local_irq_disable(); + lapic_cal_handler(NULL); + local_irq_enable(); + } + + local_irq_disable(); /* Build delta t1-t2 as apic timer counts down */ delta = lapic_cal_t1 - lapic_cal_t2; @@ -912,10 +949,11 @@ static int __init calibrate_APIC_clock(void) levt->features &= ~CLOCK_EVT_FEAT_DUMMY; /* - * PM timer calibration failed or not turned on - * so lets try APIC timer based calibration + * PM timer calibration failed or not turned on so lets try APIC + * timer based calibration, if a global clockevent device is + * available. */ - if (!pm_referenced) { + if (!pm_referenced && global_clock_event) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); /* @@ -1324,6 +1362,56 @@ static void lapic_setup_esr(void) oldvalue, value); } +static void apic_pending_intr_clear(void) +{ + long long max_loops = cpu_khz ? cpu_khz : 1000000; + unsigned long long tsc = 0, ntsc; + unsigned int value, queued; + int i, j, acked = 0; + + if (boot_cpu_has(X86_FEATURE_TSC)) + tsc = rdtsc(); + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + */ + do { + queued = 0; + for (i = APIC_ISR_NR - 1; i >= 0; i--) + queued |= apic_read(APIC_IRR + i*0x10); + + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1<<j)) { + ack_APIC_irq(); + acked++; + } + } + } + if (acked > 256) { + printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n", + acked); + break; + } + if (queued) { + if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { + ntsc = rdtsc(); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } + } while (queued && max_loops > 0); + WARN_ON(max_loops <= 0); +} + /** * setup_local_APIC - setup the local APIC * @@ -1333,19 +1421,22 @@ static void lapic_setup_esr(void) void setup_local_APIC(void) { int cpu = smp_processor_id(); - unsigned int value, queued; - int i, j, acked = 0; - unsigned long long tsc = 0, ntsc; - long long max_loops = cpu_khz ? cpu_khz : 1000000; + unsigned int value; - if (boot_cpu_has(X86_FEATURE_TSC)) - tsc = rdtsc(); if (disable_apic) { disable_ioapic_support(); return; } + /* + * If this comes from kexec/kcrash the APIC might be enabled in + * SPIV. Soft disable it before doing further initialization. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); + #ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ if (lapic_is_integrated() && apic->disable_esr) { @@ -1371,16 +1462,21 @@ void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches the - * actual value. - */ - i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); - /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - logical_smp_processor_id(); + if (apic->dest_logical) { + int logical_apicid, ldr_apicid; + + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches + * the actual value. + */ + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + if (logical_apicid != BAD_APICID) + WARN_ON(logical_apicid != ldr_apicid); + /* Always use the value from LDR. */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + } #endif /* @@ -1391,45 +1487,7 @@ void setup_local_APIC(void) value &= ~APIC_TPRI_MASK; apic_write(APIC_TASKPRI, value); - /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. - */ - do { - queued = 0; - for (i = APIC_ISR_NR - 1; i >= 0; i--) - queued |= apic_read(APIC_IRR + i*0x10); - - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<<j)) { - ack_APIC_irq(); - acked++; - } - } - } - if (acked > 256) { - printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n", - acked); - break; - } - if (queued) { - if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { - ntsc = rdtsc(); - max_loops = (cpu_khz << 10) - (ntsc - tsc); - } else - max_loops--; - } - } while (queued && max_loops > 0); - WARN_ON(max_loops <= 0); + apic_pending_intr_clear(); /* * Now that we are all set up, enable the APIC diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index e12fbcfc9571..a527f2a712b4 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -38,32 +38,12 @@ static int bigsmp_early_logical_apicid(int cpu) return early_per_cpu(x86_cpu_to_apicid, cpu); } -static inline unsigned long calculate_ldr(int cpu) -{ - unsigned long val, id; - - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - id = per_cpu(x86_bios_cpu_apicid, cpu); - val |= SET_APIC_LOGICAL_ID(id); - - return val; -} - /* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... + * bigsmp enables physical destination mode + * and doesn't use LDR and DFR */ static void bigsmp_init_apic_ldr(void) { - unsigned long val; - int cpu = smp_processor_id(); - - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = calculate_ldr(cpu); - apic_write(APIC_LDR, val); } static void bigsmp_setup_apic_routing(void) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 96a8a68f9c79..2271adbc3c42 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1690,9 +1690,10 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) static inline bool ioapic_irqd_mask(struct irq_data *data) { - /* If we are moving the irq we need to mask it */ + /* If we are moving the IRQ we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic_irq(data); + if (!irqd_irq_masked(data)) + mask_ioapic_irq(data); return true; } return false; @@ -1729,7 +1730,9 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) */ if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic_irq(data); + /* If the IRQ is masked in the core, leave it: */ + if (!irqd_irq_masked(data)) + unmask_ioapic_irq(data); } } #else @@ -2342,7 +2345,13 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use * gsi_top if ioapic_dynirq_base hasn't been initialized yet. */ - return ioapic_initialized ? ioapic_dynirq_base : gsi_top; + if (!ioapic_initialized) + return gsi_top; + /* + * For DT enabled machines ioapic_dynirq_base is irrelevant and not + * updated. So simply return @from if ioapic_dynirq_base == 0. + */ + return ioapic_dynirq_base ? : from; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 570e8bb1f386..e13ddd19a76c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o tsx.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e03515662c0..3914f9218a6b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -772,6 +772,64 @@ static void init_amd_ln(struct cpuinfo_x86 *c) msr_set_bit(MSR_AMD64_DE_CFG, 31); } +static bool rdrand_force; + +static int __init rdrand_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "force")) + rdrand_force = true; + else + return -EINVAL; + + return 0; +} +early_param("rdrand", rdrand_cmdline); + +static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) +{ + /* + * Saving of the MSR used to hide the RDRAND support during + * suspend/resume is done by arch/x86/power/cpu.c, which is + * dependent on CONFIG_PM_SLEEP. + */ + if (!IS_ENABLED(CONFIG_PM_SLEEP)) + return; + + /* + * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * RDRAND support using the CPUID function directly. + */ + if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) + return; + + msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62); + + /* + * Verify that the CPUID change has occurred in case the kernel is + * running virtualized and the hypervisor doesn't support the MSR. + */ + if (cpuid_ecx(1) & BIT(30)) { + pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n"); + return; + } + + clear_cpu_cap(c, X86_FEATURE_RDRAND); + pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n"); +} + +static void init_amd_jg(struct cpuinfo_x86 *c) +{ + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); +} + static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; @@ -786,16 +844,24 @@ static void init_amd_bd(struct cpuinfo_x86 *c) wrmsrl_safe(MSR_F15H_IC_CFG, value); } } + + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); } static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); + /* - * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects - * all up to and including B1. + * Fix erratum 1076: CPB feature bit not being set in CPUID. + * Always set it, except when running under a hypervisor. */ - if (c->x86_model <= 1 && c->x86_stepping <= 1) + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) set_cpu_cap(c, X86_FEATURE_CPB); } @@ -827,6 +893,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x16: init_amd_jg(c); break; case 0x17: init_amd_zn(c); break; } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 004e60470a77..7896a34f53b5 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -26,14 +26,21 @@ #include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/alternative.h> +#include <asm/hypervisor.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/intel-family.h> #include <asm/e820/api.h> +#include "cpu.h" + +static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); +static void __init mds_select_mitigation(void); +static void __init mds_print_mitigation(void); +static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -60,6 +67,13 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +/* Control MDS CPU buffer clear before returning to user space */ +DEFINE_STATIC_KEY_FALSE(mds_user_clear); +EXPORT_SYMBOL_GPL(mds_user_clear); +/* Control MDS CPU buffer clear before idling (halt, mwait) */ +DEFINE_STATIC_KEY_FALSE(mds_idle_clear); +EXPORT_SYMBOL_GPL(mds_idle_clear); + void __init check_bugs(void) { identify_boot_cpu(); @@ -68,7 +82,7 @@ void __init check_bugs(void) * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_check_topology_early(); + cpu_smt_check_topology(); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); @@ -87,16 +101,21 @@ void __init check_bugs(void) if (boot_cpu_has(X86_FEATURE_STIBP)) x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; - /* Select the proper spectre mitigation before patching alternatives */ + /* Select the proper CPU mitigations before patching alternatives: */ + spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); /* - * Select proper mitigation for any exposure to the Speculative Store - * Bypass vulnerability. + * As MDS and TAA mitigations are inter-related, print MDS + * mitigation until after TAA mitigation selection is done. */ - ssb_select_mitigation(); + mds_print_mitigation(); - l1tf_select_mitigation(); + arch_smt_update(); #ifdef CONFIG_X86_32 /* @@ -204,6 +223,266 @@ static void x86_amd_ssb_disable(void) } #undef pr_fmt +#define pr_fmt(fmt) "MDS: " fmt + +/* Default mitigation for MDS-affected CPUs */ +static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static bool mds_nosmt __ro_after_init = false; + +static const char * const mds_strings[] = { + [MDS_MITIGATION_OFF] = "Vulnerable", + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", + [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", +}; + +static void __init mds_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { + mds_mitigation = MDS_MITIGATION_OFF; + return; + } + + if (mds_mitigation == MDS_MITIGATION_FULL) { + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + mds_mitigation = MDS_MITIGATION_VMWERV; + + static_branch_enable(&mds_user_clear); + + if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && + (mds_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + } +} + +static void __init mds_print_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; + + pr_info("%s\n", mds_strings[mds_mitigation]); +} + +static int __init mds_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + mds_mitigation = MDS_MITIGATION_OFF; + else if (!strcmp(str, "full")) + mds_mitigation = MDS_MITIGATION_FULL; + else if (!strcmp(str, "full,nosmt")) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_nosmt = true; + } + + return 0; +} +early_param("mds", mds_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + goto out; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* + * TAA mitigation via VERW is turned off if both + * tsx_async_abort=off and mds=off are specified. + */ + if (taa_mitigation == TAA_MITIGATION_OFF && + mds_mitigation == MDS_MITIGATION_OFF) + goto out; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + + /* + * Update MDS mitigation, if necessary, as the mds_user_clear is + * now enabled for TAA mitigation. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } +out: + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V1 : " fmt + +enum spectre_v1_mitigation { + SPECTRE_V1_MITIGATION_NONE, + SPECTRE_V1_MITIGATION_AUTO, +}; + +static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = + SPECTRE_V1_MITIGATION_AUTO; + +static const char * const spectre_v1_strings[] = { + [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", + [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization", +}; + +/* + * Does SMAP provide full mitigation against speculative kernel access to + * userspace? + */ +static bool smap_works_speculatively(void) +{ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return false; + + /* + * On CPUs which are vulnerable to Meltdown, SMAP does not + * prevent speculative access to user data in the L1 cache. + * Consider SMAP to be non-functional as a mitigation on these + * CPUs. + */ + if (boot_cpu_has(X86_BUG_CPU_MELTDOWN)) + return false; + + return true; +} + +static void __init spectre_v1_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + return; + } + + if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { + /* + * With Spectre v1, a user can speculatively control either + * path of a conditional swapgs with a user-controlled GS + * value. The mitigation is to add lfences to both code paths. + * + * If FSGSBASE is enabled, the user can put a kernel address in + * GS, in which case SMAP provides no protection. + * + * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the + * FSGSBASE enablement patches have been merged. ] + * + * If FSGSBASE is disabled, the user can only put a user space + * address in GS. That makes an attack harder, but still + * possible if there's no SMAP protection. + */ + if (!smap_works_speculatively()) { + /* + * Mitigation can be provided from SWAPGS itself or + * PTI as the CR3 write in the Meltdown mitigation + * is serializing. + * + * If neither is there, mitigate with an LFENCE to + * stop speculation through swapgs. + */ + if (boot_cpu_has_bug(X86_BUG_SWAPGS) && + !boot_cpu_has(X86_FEATURE_PTI)) + setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER); + + /* + * Enable lfences in the kernel entry (non-swapgs) + * paths, to prevent user entry from speculatively + * skipping swapgs. + */ + setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL); + } + } + + pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]); +} + +static int __init nospectre_v1_cmdline(char *str) +{ + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + return 0; +} +early_param("nospectre_v1", nospectre_v1_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = @@ -271,7 +550,7 @@ static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; -} v2_user_options[] __initdata = { +} v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, @@ -406,7 +685,7 @@ static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; -} mitigation_options[] __initdata = { +} mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, @@ -427,7 +706,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); @@ -559,9 +839,6 @@ specv2_set_mode: /* Set up IBPB and STIBP depending on the general spectre V2 command */ spectre_v2_user_select_mitigation(cmd); - - /* Enable STIBP if appropriate */ - arch_smt_update(); } static void update_stibp_msr(void * __unused) @@ -595,12 +872,34 @@ static void update_indir_branch_cond(void) static_branch_disable(&switch_to_cond_stibp); } -void arch_smt_update(void) +#undef pr_fmt +#define pr_fmt(fmt) fmt + +/* Update the static key controlling the MDS CPU buffer clear in idle */ +static void update_mds_branch_idle(void) { - /* Enhanced IBRS implies STIBP. No update required. */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + /* + * Enable the idle clearing if SMT is active on CPUs which are + * affected only by MSBDS and not any other MDS variant. + * + * The other variants cannot be mitigated when SMT is enabled, so + * clearing the buffers on idle just to prevent the Store Buffer + * repartitioning leak would be a window dressing exercise. + */ + if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) return; + if (sched_smt_active()) + static_branch_enable(&mds_idle_clear); + else + static_branch_disable(&mds_idle_clear); +} + +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" + +void arch_smt_update(void) +{ mutex_lock(&spec_ctrl_mutex); switch (spectre_v2_user) { @@ -615,6 +914,28 @@ void arch_smt_update(void) break; } + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); + update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } + + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -642,7 +963,7 @@ static const char * const ssb_strings[] = { static const struct { const char *option; enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initdata = { +} ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ @@ -656,7 +977,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", @@ -718,6 +1040,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) } /* + * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper + * bit in the mask to allow guests to use the mitigation even in the + * case where the host does not enable it. + */ + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; + } + + /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass @@ -734,7 +1066,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); } } @@ -920,6 +1251,9 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -977,6 +1311,11 @@ static void __init l1tf_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { @@ -1005,7 +1344,7 @@ static void __init l1tf_select_mitigation(void) pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", half_pa); pr_info("However, doing so will make a part of your RAM unusable.\n"); - pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n"); + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); return; } @@ -1038,6 +1377,7 @@ static int __init l1tf_cmdline(char *str) early_param("l1tf", l1tf_cmdline); #undef pr_fmt +#define pr_fmt(fmt) fmt #ifdef CONFIG_SYSFS @@ -1069,13 +1409,58 @@ static ssize_t l1tf_show_state(char *buf) l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} #else static ssize_t l1tf_show_state(char *buf) { return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} #endif +static ssize_t mds_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + mds_strings[mds_mitigation]); + } + + if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : + sched_smt_active() ? "mitigated" : "disabled")); + } + + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1120,7 +1505,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr break; case X86_BUG_SPECTRE_V1: - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], @@ -1137,6 +1522,16 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) return l1tf_show_state(buf); break; + + case X86_BUG_MDS: + return mds_show_state(buf); + + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1168,4 +1563,19 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b { return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); } + +ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); +} + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 51e49f6fe8e1..3d805e8b3739 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -768,6 +768,30 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } +static void init_cqm(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -799,33 +823,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* Additional Intel-defined flags: level 0x0000000F */ - if (c->cpuid_level >= 0x0000000F) { - - /* QoS sub-leaf, EAX=0Fh, ECX=0 */ - cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_0_EDX] = edx; - - if (cpu_has(c, X86_FEATURE_CQM_LLC)) { - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = ebx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_1_EDX] = edx; - - if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || - ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || - (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } - } else { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - } - } - /* AMD-defined flags: level 0x80000001 */ eax = cpuid_eax(0x80000000); c->extended_cpuid_level = eax; @@ -863,6 +860,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); + init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. @@ -899,85 +897,136 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) c->x86_cache_bits = c->x86_phys_bits; } -static const __initconst struct x86_cpu_id cpu_no_speculation[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_CENTAUR, 5 }, - { X86_VENDOR_INTEL, 5 }, - { X86_VENDOR_NSC, 5 }, - { X86_VENDOR_ANY, 4 }, - {} -}; +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) -static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { - { X86_VENDOR_AMD }, - {} -}; +#define VULNWL(_vendor, _family, _model, _whitelist) \ + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } -static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, - { X86_VENDOR_CENTAUR, 5, }, - { X86_VENDOR_INTEL, 5, }, - { X86_VENDOR_NSC, 5, }, - { X86_VENDOR_AMD, 0x12, }, - { X86_VENDOR_AMD, 0x11, }, - { X86_VENDOR_AMD, 0x10, }, - { X86_VENDOR_AMD, 0xf, }, - { X86_VENDOR_ANY, 4, }, - {} -}; +#define VULNWL_INTEL(model, whitelist) \ + VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) + +#define VULNWL_AMD(family, whitelist) \ + VULNWL(AMD, family, X86_MODEL_ANY, whitelist) + +static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + + /* Intel Family 6 */ + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(CORE_YONAH, NO_SSB), + + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), -static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { - /* in addition to cpu_no_speculation */ - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + /* + * Technically, swapgs isn't serializing on AMD (despite it previously + * being documented as such in the APM). But according to AMD, %gs is + * updated non-speculatively, and the issuing of %gs-relative memory + * operands will be blocked until the %gs update completes, which is + * good enough for our purposes. + */ + + VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT), + + /* AMD Family 0xf - 0x12 */ + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), {} }; -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +static bool __init cpu_matches(unsigned long which) +{ + const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + + return m && !!(m->driver_data & which); +} + +u64 x86_read_arch_cap_msr(void) { u64 ia32_cap = 0; - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!x86_match_cpu(cpu_no_spec_store_bypass) && - !(ia32_cap & ARCH_CAP_SSB_NO) && - !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) - setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + return ia32_cap; +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); - if (x86_match_cpu(cpu_no_speculation)) + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (x86_match_cpu(cpu_no_meltdown)) + if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { + setup_force_cpu_bug(X86_BUG_MDS); + if (cpu_matches(MSBDS_ONLY)) + setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); + } + + if (!cpu_matches(NO_SWAPGS)) + setup_force_cpu_bug(X86_BUG_SWAPGS); + + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + + if (cpu_matches(NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -986,7 +1035,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (x86_match_cpu(cpu_no_l1tf)) + if (cpu_matches(NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); @@ -1017,6 +1066,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof c->x86_capability); c->extended_cpuid_level = 0; + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + /* cyrix could have cpuid enabled via c_identify()*/ if (have_cpuid_p()) { cpu_detect(c); @@ -1033,7 +1085,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); } else { - identify_cpu_without_cpuid(c); setup_clear_cpu_cap(X86_FEATURE_CPUID); } @@ -1387,6 +1438,7 @@ void __init identify_boot_cpu(void) enable_sep_cpu(); #endif cpu_detect_tlb(&boot_cpu_data); + tsx_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index cca588407dca..db10a63687d3 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -45,6 +45,22 @@ struct _tlb_table { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); extern int detect_extended_topology_early(struct cpuinfo_x86 *c); @@ -54,4 +70,6 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern u64 x86_read_arch_cap_msr(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 904b0a3c4e53..4c9fc6a4d1ea 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -59,6 +59,9 @@ const static struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, {} }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 8949b7ae6d92..1d9b8aaea06c 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -124,7 +124,7 @@ static void set_cx86_reorder(void) setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* Load/Store Serialize to mem access disable (=reorder it) */ - setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80); + setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); /* set load/store serialize from 1GB to 4GB */ ccr3 |= 0xe0; setCx86(CX86_CCR3, ccr3); @@ -135,11 +135,11 @@ static void set_cx86_memwb(void) pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); } /* @@ -153,14 +153,14 @@ static void geode_configure(void) local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* FPU fast, DTE cache, Mem bypass */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38); + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); @@ -296,7 +296,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); /* * GXm : 0x30 ... 0x5f GXm datasheet 51 @@ -319,7 +319,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) if (dir1 > 7) { dir0_msn++; /* M II */ /* Enable MMX extensions (App note 108) */ - setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); } else { /* A 6x86MX - it has the bug. */ set_cpu_bug(c, X86_BUG_COMA); @@ -437,7 +437,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c) /* enable MAPEN */ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable cpuid */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* disable MAPEN */ setCx86(CX86_CCR3, ccr3); local_irq_restore(flags); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 574dcdc092ab..3a5ea741701b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -695,6 +695,11 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_energy_perf(c); init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 665d0f6cd62f..3f731d7f04bf 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -526,7 +526,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (static_branch_unlikely(&rdt_mon_enable_key)) rmdir_mondata_subdir_allrdtgrp(r, d->id); list_del(&d->list); - if (is_mbm_enabled()) + if (r->mon_capable && is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { /* diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index f6ea94f8954a..f892cb0b485e 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -313,6 +313,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } md.priv = of->kn->priv; resid = md.u.rid; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 2dae1b3c42fc..0ec30b2384c0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1107,7 +1107,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, if (rdt_mon_capable) { ret = mongroup_create_dir(rdtgroup_default.kn, - NULL, "mon_groups", + &rdtgroup_default, "mon_groups", &kn_mongrp); if (ret) { dentry = ERR_PTR(ret); @@ -1260,7 +1260,11 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { free_rmid(sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); - kfree(sentry); + + if (atomic_read(&sentry->waitcount) != 0) + sentry->flags = RDT_DELETED; + else + kfree(sentry); } } @@ -1294,7 +1298,11 @@ static void rmdir_all_sub(void) kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); - kfree(rdtgrp); + + if (atomic_read(&rdtgrp->waitcount) != 0) + rdtgrp->flags = RDT_DELETED; + else + kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ update_closid_rmid(cpu_online_mask, &rdtgroup_default); @@ -1491,7 +1499,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, /* * Create the mon_data directory first. */ - ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); if (ret) return ret; @@ -1525,7 +1533,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, uint files = 0; int ret; - prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + prdtgrp = rdtgroup_kn_lock_live(parent_kn); if (!prdtgrp) { ret = -ENODEV; goto out_unlock; @@ -1581,7 +1589,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, kernfs_activate(kn); /* - * The caller unlocks the prgrp_kn upon success. + * The caller unlocks the parent_kn upon success. */ return 0; @@ -1592,7 +1600,7 @@ out_destroy: out_free_rgrp: kfree(rdtgrp); out_unlock: - rdtgroup_kn_unlock(prgrp_kn); + rdtgroup_kn_unlock(parent_kn); return ret; } @@ -1630,7 +1638,7 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, */ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); - rdtgroup_kn_unlock(prgrp_kn); + rdtgroup_kn_unlock(parent_kn); return ret; } @@ -1667,7 +1675,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, * Create an empty mon_groups directory to hold the subset * of tasks and cpus to monitor. */ - ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); if (ret) goto out_id_free; } @@ -1680,7 +1688,7 @@ out_id_free: out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); out_unlock: - rdtgroup_kn_unlock(prgrp_kn); + rdtgroup_kn_unlock(parent_kn); return ret; } @@ -1792,11 +1800,6 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); - /* - * Free all the child monitor group rmids. - */ - free_all_child_rdtgrp(rdtgrp); - list_del(&rdtgrp->rdtgroup_list); /* @@ -1806,6 +1809,11 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, kernfs_get(kn); kernfs_remove(rdtgrp->kn); + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 8fec687b3e44..f12141ba9a76 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -108,6 +108,9 @@ static void setup_inj_struct(struct mce *m) memset(m, 0, sizeof(struct mce)); m->cpuvendor = boot_cpu_data.x86_vendor; + m->time = ktime_get_real_seconds(); + m->cpuid = cpuid_eax(1); + m->microcode = boot_cpu_data.microcode; } /* Update fake mce registers on current CPU. */ @@ -576,6 +579,9 @@ static int inj_bank_set(void *data, u64 val) m->bank = val; do_inject(); + /* Reset injection struct */ + setup_inj_struct(&i_mce); + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c51353569492..20d133ec3ef9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -148,6 +148,11 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), KERNEL ), + MCESEV( + PANIC, "Instruction fetch error in kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + KERNEL + ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 98e4e4dc4a3b..0b0e44f85393 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -701,19 +701,49 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); + + /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) continue; /* - * Uncorrected or signalled events are handled by the exception - * handler when it is enabled, so don't process those here. - * - * TBD do the same check for MCI_STATUS_EN here? + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. */ - if (!(flags & MCP_UC) && - (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) - continue; + if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + goto log_it; + /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ + if (!mca_cfg.ser) { + if (m.status & MCI_STATUS_UC) + continue; + goto log_it; + } + + /* Log "not enabled" (speculative) errors */ + if (!(m.status & MCI_STATUS_EN)) + goto log_it; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + goto log_it; + + /* + * Skip anything else. Presumption is that our read of this + * bank is racing with a machine check. Leave the log alone + * for do_machine_check() to deal with it. + */ + continue; + +log_it: error_seen = true; mce_read_aux(&m, i); @@ -772,6 +802,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); + m->bank = i; if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); *msg = tmp; @@ -1629,36 +1660,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 0x15 && c->x86_model <= 0xf) mce_flags.overflow_recov = 1; - /* - * Turn off MC4_MISC thresholding banks on those models since - * they're not supported there. - */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { - int i; - u64 hwcr; - bool need_toggle; - u32 msrs[] = { - 0x00000413, /* MC4_MISC0 */ - 0xc0000408, /* MC4_MISC1 */ - }; - - rdmsrl(MSR_K7_HWCR, hwcr); - - /* McStatusWrEn has to be set */ - need_toggle = !(hwcr & BIT(18)); - - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); - - /* Clear CntP bit safely */ - for (i = 0; i < ARRAY_SIZE(msrs); i++) - msr_clear_bit(msrs[i], 62); - - /* restore old settings */ - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); - } } if (c->x86_vendor == X86_VENDOR_INTEL) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 4fa97a44e73f..a8f47697276b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -228,10 +228,10 @@ static void smca_configure(unsigned int bank, unsigned int cpu) } /* Return early if this bank was already initialized. */ - if (smca_banks[bank].hwid) + if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } @@ -544,6 +544,40 @@ out: return offset; } +/* + * Turn off MC4_MISC thresholding banks on all family 0x15 models since + * they're not supported there. + */ +void disable_err_thresholding(struct cpuinfo_x86 *c) +{ + int i; + u64 hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + if (c->x86 != 0x15) + return; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + /* Clear CntP bit safely */ + for (i = 0; i < ARRAY_SIZE(msrs); i++) + msr_clear_bit(msrs[i], 62); + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -551,6 +585,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; + disable_err_thresholding(c); + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) smca_configure(bank, cpu); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index ee229ceee745..ec6a07b04fdb 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -185,7 +185,7 @@ static void therm_throt_process(bool new_event, int event, int level) /* if we just entered the thermal event */ if (new_event) { if (event == THERMAL_THROTTLING_EVENT) - pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + pr_warn("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9d33dbf2489e..d0a61d3e2fb9 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -707,7 +707,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) if (!p) { return ret; } else { - if (boot_cpu_data.microcode == p->patch_id) + if (boot_cpu_data.microcode >= p->patch_id) return ret; ret = UCODE_NEW; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 387a8f44fba1..93c22e7ee424 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -418,8 +418,9 @@ static int do_microcode_update(const void __user *buf, size_t size) if (ustate == UCODE_ERROR) { error = -1; break; - } else if (ustate == UCODE_OK) + } else if (ustate == UCODE_NEW) { apply_microcode_on_target(cpu); + } } return error; @@ -789,13 +790,16 @@ static struct syscore_ops mc_syscore_ops = { .resume = mc_bp_resume, }; -static int mc_cpu_online(unsigned int cpu) +static int mc_cpu_starting(unsigned int cpu) { - struct device *dev; - - dev = get_cpu_device(cpu); microcode_update_cpu(cpu); pr_debug("CPU%d added\n", cpu); + return 0; +} + +static int mc_cpu_online(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); @@ -872,6 +876,8 @@ int __init microcode_init(void) goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); + cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", + mc_cpu_starting, NULL); cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index d0dfb892c72f..aed45b8895d5 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -4,6 +4,8 @@ # Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # +set -e + IN=$1 OUT=$2 diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index df11f5d604be..ed7ce5184a77 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -21,6 +21,10 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000000..3e20d322bc98 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> + */ + +#include <linux/cpufeature.h> + +#include <asm/cmdline.h> + +#include "cpu.h" + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. + */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static bool __init tsx_ctrl_is_supported(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + if (!tsx_ctrl_is_supported()) + return; + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("tsx: invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the + * RTM CPUID bit. Clear it here since it is now + * expected to be not set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the + * RTM CPUID bit. Force it here since it is now + * expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + } +} diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2ea85b32421a..2e5003fef51a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -93,7 +93,7 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); -void __kernel_fpu_begin(void) +static void __kernel_fpu_begin(void) { struct fpu *fpu = ¤t->thread.fpu; @@ -111,9 +111,8 @@ void __kernel_fpu_begin(void) __cpu_invalidate_fpregs_state(); } } -EXPORT_SYMBOL(__kernel_fpu_begin); -void __kernel_fpu_end(void) +static void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; @@ -122,7 +121,6 @@ void __kernel_fpu_end(void) kernel_fpu_enable(); } -EXPORT_SYMBOL(__kernel_fpu_end); void kernel_fpu_begin(void) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7acb87cb2da8..ccc2b9d2956a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -22,6 +22,7 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/module.h> +#include <linux/memory.h> #include <trace/syscall.h> @@ -30,11 +31,13 @@ #include <asm/sections.h> #include <asm/ftrace.h> #include <asm/nops.h> +#include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE int ftrace_arch_code_modify_prepare(void) { + mutex_lock(&text_mutex); set_kernel_text_rw(); set_all_modules_text_rw(); return 0; @@ -44,6 +47,7 @@ int ftrace_arch_code_modify_post_process(void) { set_all_modules_text_ro(); set_kernel_text_ro(); + mutex_unlock(&text_mutex); return 0; } @@ -229,6 +233,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, } static unsigned long ftrace_update_func; +static unsigned long ftrace_update_func_call; static int update_ftrace_func(unsigned long ip, void *new) { @@ -257,6 +262,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned char *new; int ret; + ftrace_update_func_call = (unsigned long)func; + new = ftrace_call_replace(ip, (unsigned long)func); ret = update_ftrace_func(ip, new); @@ -292,13 +299,28 @@ int ftrace_int3_handler(struct pt_regs *regs) if (WARN_ON_ONCE(!regs)) return 0; - ip = regs->ip - 1; - if (!ftrace_location(ip) && !is_ftrace_caller(ip)) - return 0; + ip = regs->ip - INT3_INSN_SIZE; - regs->ip += MCOUNT_INSN_SIZE - 1; +#ifdef CONFIG_X86_64 + if (ftrace_location(ip)) { + int3_emulate_call(regs, (unsigned long)ftrace_regs_caller); + return 1; + } else if (is_ftrace_caller(ip)) { + if (!ftrace_update_func_call) { + int3_emulate_jmp(regs, ip + CALL_INSN_SIZE); + return 1; + } + int3_emulate_call(regs, ftrace_update_func_call); + return 1; + } +#else + if (ftrace_location(ip) || is_ftrace_caller(ip)) { + int3_emulate_jmp(regs, ip + CALL_INSN_SIZE); + return 1; + } +#endif - return 1; + return 0; } static int ftrace_write(unsigned long ip, const char *val, int size) @@ -869,6 +891,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) func = ftrace_ops_get_func(ops); + ftrace_update_func_call = (unsigned long)func; + /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); ret = update_ftrace_func(ip, new); @@ -965,6 +989,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func) { unsigned char *new; + ftrace_update_func_call = 0UL; new = ftrace_jmp_replace(ip, (unsigned long)func); return update_ftrace_func(ip, new); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 45b5c6c4a55e..e00ccbcc2913 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -117,26 +117,27 @@ unsigned long __head __startup_64(unsigned long physaddr, pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; - i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + pgtable_flags; - p4d[i + 1] = (pgdval_t)pud + pgtable_flags; + i = physaddr >> P4D_SHIFT; + p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; pgd[i + 0] = (pgdval_t)pud + pgtable_flags; pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } - i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + pgtable_flags; - pud[i + 1] = (pudval_t)pmd + pgtable_flags; + i = physaddr >> PUD_SHIFT; + pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { - int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD; - pmd[idx] = pmd_entry + i * PMD_SIZE; + int idx = i + (physaddr >> PMD_SHIFT); + + pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; } /* @@ -144,13 +145,31 @@ unsigned long __head __startup_64(unsigned long physaddr, * we might write invalid pmds, when the kernel is relocated * cleanup_highmap() fixes this up along with the mappings * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. */ pmd = fixup_pointer(level2_kernel_pgt, physaddr); - for (i = 0; i < PTRS_PER_PMD; i++) { + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index((unsigned long)_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index((unsigned long)_end); i++) if (pmd[i] & _PAGE_PRESENT) pmd[i] += load_delta; - } + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; /* * Fixup phys_base - remove the memory encryption mask to obtain diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index afa1a204bc6d..df767e6de8dd 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -909,6 +909,8 @@ int __init hpet_enable(void) return 0; hpet_set_mapping(); + if (!hpet_virt_address) + return 0; /* * Read the period and check for a sane value: diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 8771766d46b6..9954a604a822 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -352,6 +352,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) #endif default: WARN_ON_ONCE(1); + return -EINVAL; } /* diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..b50ac9c7397b 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,9 +26,18 @@ int sysctl_panic_on_stackoverflow; /* * Probabilistic stack overflow check: * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. + * Regular device interrupts can enter on the following stacks: + * + * - User stack + * + * - Kernel task stack + * + * - Interrupt stack if a device driver reenables interrupts + * which should only happen in really old drivers. + * + * - Debug IST stack + * + * All other contexts are invalid. */ static inline void stack_overflow_check(struct pt_regs *regs) { @@ -53,8 +62,8 @@ static inline void stack_overflow_check(struct pt_regs *regs) return; oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; - estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; + estack_bottom = (u64)oist->ist[DEBUG_STACK]; + estack_top = estack_bottom - DEBUG_STKSZ + STACK_TOP_MARGIN; if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 928b0c6083c9..4d948d87f01c 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -167,6 +167,9 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, struct efi_info *current_ei = &boot_params.efi_info; struct efi_info *ei = ¶ms->efi_info; + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + if (!current_ei->efi_memmap_size) return 0; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8e36f249646e..904e18bb38c5 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -438,7 +438,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) */ void kgdb_roundup_cpus(unsigned long flags) { - apic->send_IPI_allbutself(APIC_DM_NMI); + apic->send_IPI_allbutself(NMI_VECTOR); } #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 65452d555f05..9d7bb8de2917 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -553,6 +553,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) unsigned long *sara = stack_addr(regs); ri->ret_addr = (kprobe_opcode_t *) *sara; + ri->fp = sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; @@ -743,26 +744,48 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); +static struct kprobe kretprobe_kprobe = { + .addr = (void *)kretprobe_trampoline, +}; + /* * Called from kretprobe_trampoline */ __visible __used void *trampoline_handler(struct pt_regs *regs) { + struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; struct hlist_node *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; kprobe_opcode_t *correct_ret_addr = NULL; + void *frame_pointer; + bool skipped = false; + + preempt_disable(); + + /* + * Set a dummy kprobe for avoiding kretprobe recursion. + * Since kretprobe never run in kprobe handler, kprobe must not + * be running at this point. + */ + kcb = get_kprobe_ctlblk(); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; + /* On x86-64, we use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); regs->gs = 0; + /* On x86-32, we use pt_regs->flags for return address holder. */ + frame_pointer = ®s->flags; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -784,8 +807,25 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be poped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); + skipped = true; + continue; + } orig_ret_address = (unsigned long)ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); if (orig_ret_address != trampoline_address) /* @@ -803,14 +843,15 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + if (ri->fp != frame_pointer) + continue; orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); } recycle_rp_inst(ri, &empty_rp); @@ -826,6 +867,9 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3668f28cf5fc..f4e4db0cbd59 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -141,6 +141,11 @@ asm ( void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); +NOKPROBE_SYMBOL(optprobe_template_func); +NOKPROBE_SYMBOL(optprobe_template_entry); +NOKPROBE_SYMBOL(optprobe_template_val); +NOKPROBE_SYMBOL(optprobe_template_call); +NOKPROBE_SYMBOL(optprobe_template_end); #define TMPL_MOVE_IDX \ ((long)&optprobe_template_val - (long)&optprobe_template_entry) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 652bdd867782..5853eb50138e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -631,6 +631,7 @@ asm( "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" "setne %al;" "ret;" +".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" ".popsection"); #endif diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index bc6bc6689e68..236abf77994b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -544,17 +544,15 @@ void __init default_get_smp_config(unsigned int early) * local APIC has default address */ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - return; + goto out; } pr_info("Default MP configuration #%d\n", mpf->feature1); construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { - if (check_physptr(mpf, early)) { - early_memunmap(mpf, sizeof(*mpf)); - return; - } + if (check_physptr(mpf, early)) + goto out; } else BUG(); @@ -563,7 +561,7 @@ void __init default_get_smp_config(unsigned int early) /* * Only use the first configuration found. */ - +out: early_memunmap(mpf, sizeof(*mpf)); } @@ -596,8 +594,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf_base = base; mpf_found = true; - pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", - base, base + sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n", + base, base + sizeof(*mpf) - 1); memblock_reserve(base, sizeof(*mpf)); if (mpf->physptr) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 35aafc95e4b8..d796a7f6a74a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -34,6 +34,7 @@ #include <asm/x86_init.h> #include <asm/reboot.h> #include <asm/cache.h> +#include <asm/nospec-branch.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -533,6 +534,9 @@ nmi_restart: write_cr2(this_cpu_read(nmi_cr2)); if (this_cpu_dec_return(nmi_state)) goto nmi_restart; + + if (user_mode(regs)) + mds_user_clear_cpu_buffers(); } NOKPROBE_SYMBOL(do_nmi); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index a98d1cdd6299..d2ef967bfafb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -465,10 +465,12 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) void speculation_ctrl_update(unsigned long tif) { + unsigned long flags; + /* Forced update. Make sure all relevant TIF flags are different */ - preempt_disable(); + local_irq_save(flags); __speculation_ctrl_update(~tif, tif); - preempt_enable(); + local_irq_restore(flags); } /* Called from seccomp/prctl update */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c2df91eab573..df6bb5f8ab2a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -132,6 +132,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct task_struct *tsk; int err; + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled. + */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ec63d6be5e02..d1dfd1397b46 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -278,6 +278,14 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; + + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled. + */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index ed5c4cdf0a34..734549492a18 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,6 +24,7 @@ #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/context_tracking.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/pgtable.h> @@ -39,6 +40,7 @@ #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/syscall.h> +#include <asm/mmu_context.h> #include "tls.h" @@ -342,6 +344,49 @@ static int set_segment_reg(struct task_struct *task, return 0; } +static unsigned long task_seg_base(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* + * There are no user segments in the GDT with nonzero bases + * other than the TLS segments. + */ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(&task->thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* + * If performance here mattered, we could protect the LDT + * with RCU. This is a slow path, though, so we can just + * take the mutex. + */ + mutex_lock(&task->mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(&task->mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -435,18 +480,16 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): { - /* - * XXX: This will not behave as expected if called on - * current or if fsindex != 0. - */ - return task->thread.fsbase; + if (task->thread.fsindex == 0) + return task->thread.fsbase; + else + return task_seg_base(task, task->thread.fsindex); } case offsetof(struct user_regs_struct, gs_base): { - /* - * XXX: This will not behave as expected if called on - * current or if fsindex != 0. - */ - return task->thread.gsbase; + if (task->thread.gsindex == 0) + return task->thread.gsbase; + else + return task_seg_base(task, task->thread.gsindex); } #endif } @@ -653,7 +696,8 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp = thread->ptrace_bps[n]; + int index = array_index_nospec(n, HBP_NUM); + struct perf_event *bp = thread->ptrace_bps[index]; if (bp) val = bp->hw.info.address; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2126b9d27c34..c663d5fcff2e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -81,6 +81,19 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + void __noreturn machine_real_restart(unsigned int type) { local_irq_disable(); @@ -166,6 +179,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, /* Apple */ { /* Handle problems with rebooting on Apple MacBook5 */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4cdc0b27ec82..01741834fd6a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -131,16 +131,6 @@ static int restore_sigcontext(struct pt_regs *regs, COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && - user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ @@ -149,6 +139,15 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); +#endif + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -460,6 +459,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *fp = NULL; + unsigned long uc_flags; int err = 0; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); @@ -472,9 +472,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -540,6 +542,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, { #ifdef CONFIG_X86_X32_ABI struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -554,9 +557,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 04adc8d60aed..b2b87b91f336 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -181,6 +181,12 @@ asmlinkage __visible void smp_reboot_interrupt(void) irq_exit(); } +static int register_stop_handler(void) +{ + return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop"); +} + static void native_stop_other_cpus(int wait) { unsigned long flags; @@ -214,39 +220,41 @@ static void native_stop_other_cpus(int wait) apic->send_IPI_allbutself(REBOOT_VECTOR); /* - * Don't wait longer than a second if the caller - * didn't ask us to wait. + * Don't wait longer than a second for IPI completion. The + * wait request is not checked here because that would + * prevent an NMI shutdown attempt in case that not all + * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && (wait || timeout--)) + while (num_online_cpus() > 1 && timeout--) udelay(1); } - - /* if the REBOOT_VECTOR didn't work, try with the NMI */ - if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) { - if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, - NMI_FLAG_FIRST, "smp_stop")) - /* Note: we ignore failures here */ - /* Hope the REBOOT_IRQ is good enough */ - goto finish; - - /* sync above data before sending IRQ */ - wmb(); - pr_emerg("Shutting down cpus with NMI\n"); + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if (num_online_cpus() > 1) { + /* + * If NMI IPI is enabled, try to register the stop handler + * and send the IPI. In any case try to wait for the other + * CPUs to stop. + */ + if (!smp_no_nmi_ipi && !register_stop_handler()) { + /* Sync above data before sending IRQ */ + wmb(); - apic->send_IPI_allbutself(NMI_VECTOR); + pr_emerg("Shutting down cpus with NMI\n"); + apic->send_IPI_allbutself(NMI_VECTOR); + } /* - * Don't wait longer than a 10 ms if the caller - * didn't ask us to wait. + * Don't wait longer than 10 ms if the caller didn't + * reqeust it. If wait is true, the machine hangs here if + * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } -finish: local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 623965e86b65..897da526e40e 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -231,9 +231,55 @@ static const struct dmi_system_id efifb_dmi_system_table[] __initconst = { {}, }; +/* + * Some devices have a portrait LCD but advertise a landscape resolution (and + * pitch). We simply swap width and height for these devices so that we can + * correctly deal with some of them coming with multiple resolutions. + */ +static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = { + { + /* + * Lenovo MIIX310-10ICR, only some batches have the troublesome + * 800x1280 portrait screen. Luckily the portrait version has + * its own BIOS version, so we match on that. + */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "MIIX 310-10ICR"), + DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1HCN44WW"), + }, + }, + { + /* Lenovo MIIX 320-10ICR with 800x1280 portrait screen */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, + "Lenovo MIIX 320-10ICR"), + }, + }, + { + /* Lenovo D330 with 800x1280 or 1200x1920 portrait screen */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, + "Lenovo ideapad D330-10IGM"), + }, + }, + {}, +}; + __init void sysfb_apply_efi_quirks(void) { if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) dmi_check_system(efifb_dmi_system_table); + + if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI && + dmi_check_system(efifb_dmi_swap_width_height)) { + u16 temp = screen_info.lfb_width; + + screen_info.lfb_width = screen_info.lfb_height; + screen_info.lfb_height = temp; + screen_info.lfb_linelength = 4 * screen_info.lfb_width; + } } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index a5b802a12212..71d3fef1edc9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -5,6 +5,7 @@ #include <linux/user.h> #include <linux/regset.h> #include <linux/syscalls.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/desc.h> @@ -220,6 +221,7 @@ int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *u_info) { struct user_desc info; + int index; if (idx == -1 && get_user(idx, &u_info->entry_number)) return -EFAULT; @@ -227,8 +229,11 @@ int do_get_thread_area(struct task_struct *p, int idx, if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return -EINVAL; - fill_user_desc(&info, idx, - &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]); + index = idx - GDT_ENTRY_TLS_MIN; + index = array_index_nospec(index, + GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1); + + fill_user_desc(&info, idx, &p->thread.tls_array[index]); if (copy_to_user(u_info, &info, sizeof(info))) return -EFAULT; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 36d02484e384..5d681fe6d352 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -620,7 +620,7 @@ unsigned long native_calibrate_tsc(void) case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 3dc26f95d46e..9b9fd4826e7a 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state) } /* Get the next frame pointer: */ - if (state->regs) + if (state->next_bp) { + next_bp = state->next_bp; + state->next_bp = NULL; + } else if (state->regs) { next_bp = (unsigned long *)state->regs->bp; - else + } else { next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); + } /* Move to the next frame if it's safe: */ if (!update_stack_state(state, next_bp)) @@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, bp = get_frame_pointer(task, regs); + /* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer. + * That means that SP points into the middle of an incomplete frame: + * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we + * would have written a frame pointer if we hadn't crashed. + * Pretend that the frame is complete and that BP points to it, but save + * the real BP so that we can use it when looking for the next frame. + */ + if (regs && regs->ip == 0 && + (unsigned long *)kernel_stack_pointer(regs) >= first_frame) { + state->next_bp = bp; + bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1; + } + /* Initialize stack info and make sure the frame data is accessible: */ get_stack_info(bp, state->task, &state->stack_info, &state->stack_mask); @@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->bp < first_frame)) + (state->next_bp == NULL && state->bp < first_frame))) unwind_next_frame(state); } EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index be86a865087a..3bbb399f7ead 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -74,11 +74,28 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif +/* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer, + * and we don't have unwind information for NULL. + * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function + * pointer into its parent and then continue normally from there. + */ +static struct orc_entry null_orc_entry = { + .sp_offset = sizeof(long), + .sp_reg = ORC_REG_SP, + .bp_reg = ORC_REG_UNDEFINED, + .type = ORC_TYPE_CALL +}; + static struct orc_entry *orc_find(unsigned long ip) { if (!orc_init) return NULL; + if (ip == 0) + return &null_orc_entry; + /* For non-init vmlinux addresses, use the fast lookup table: */ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { unsigned int idx, start, stop; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index b9a8f34b5e5a..73391c1bd2a9 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -518,9 +518,12 @@ struct uprobe_xol_ops { void (*abort)(struct arch_uprobe *, struct pt_regs *); }; -static inline int sizeof_long(void) +static inline int sizeof_long(struct pt_regs *regs) { - return in_ia32_syscall() ? 4 : 8; + /* + * Check registers for mode as in_xxx_syscall() does not apply here. + */ + return user_64bit_mode(regs) ? 8 : 4; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) @@ -531,9 +534,9 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) static int push_ret_address(struct pt_regs *regs, unsigned long ip) { - unsigned long new_sp = regs->sp - sizeof_long(); + unsigned long new_sp = regs->sp - sizeof_long(regs); - if (copy_to_user((void __user *)new_sp, &ip, sizeof_long())) + if (copy_to_user((void __user *)new_sp, &ip, sizeof_long(regs))) return -EFAULT; regs->sp = new_sp; @@ -566,7 +569,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs long correction = utask->vaddr - utask->xol_vaddr; regs->ip += correction; } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { - regs->sp += sizeof_long(); /* Pop incorrect return address */ + regs->sp += sizeof_long(regs); /* Pop incorrect return address */ if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen)) return -ERESTART; } @@ -675,7 +678,7 @@ static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) * "call" insn was executed out-of-line. Just restore ->sp and restart. * We could also restore ->ip and try to call branch_emulate_op() again. */ - regs->sp += sizeof_long(); + regs->sp += sizeof_long(regs); return -ERESTART; } @@ -966,7 +969,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize = sizeof_long(), nleft; + int rasize = sizeof_long(regs), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) @@ -984,7 +987,7 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); - force_sig_info(SIGSEGV, SEND_SIG_FORCED, current); + force_sig(SIGSEGV, current); } return -1; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b854ebf5851b..2384a2ae5ec3 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -390,7 +390,7 @@ SECTIONS * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbcd69c76d96..38959b173a42 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -368,7 +368,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO); + F(AMD_SSB_NO) | F(AMD_STIBP); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -396,14 +396,15 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); r = -E2BIG; - if (*nent >= maxnent) + if (WARN_ON(*nent >= maxnent)) goto out; do_cpuid_1_ent(entry, function, index); @@ -480,8 +481,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + if (boot_cpu_has(X86_FEATURE_IBPB) && + boot_cpu_has(X86_FEATURE_IBRS)) + entry->edx |= F(SPEC_CTRL); + if (boot_cpu_has(X86_FEATURE_STIBP)) + entry->edx |= F(INTEL_STIBP); + if (boot_cpu_has(X86_FEATURE_SSBD)) + entry->edx |= F(SPEC_CTRL_SSBD); /* * We emulate ARCH_CAPABILITIES in software even * if the host doesn't support it. @@ -698,6 +707,9 @@ out: static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, u32 idx, int *nent, int maxnent, unsigned int type) { + if (*nent >= maxnent) + return -E2BIG; + if (type == KVM_GET_EMULATED_CPUID) return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 9a327d5b6d1f..d78a61408243 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -47,8 +47,6 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, - [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, - [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX}, [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5f758568fc44..eb8b843325f4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2588,15 +2588,13 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU * supports long mode. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); if (emulator_has_longmode(ctxt)) { struct desc_struct cs_desc; /* Zero CR4.PCIDE before CR0.PG. */ - if (cr4 & X86_CR4_PCIDE) { + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PCIDE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - cr4 &= ~X86_CR4_PCIDE; - } /* A 32-bit code segment is required to clear EFER.LMA. */ memset(&cs_desc, 0, sizeof(cs_desc)); @@ -2610,13 +2608,16 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ - if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + if (emulator_has_longmode(ctxt)) { + /* Clear CR4.PAE before clearing EFER.LME. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - /* And finally go back to 32-bit mode. */ - efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + /* And finally go back to 32-bit mode. */ + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + } smbase = ctxt->ops->get_smbase(ctxt); if (emulator_has_longmode(ctxt)) @@ -5297,6 +5298,8 @@ done_prefixes: ctxt->memopp->addr.mem.ea + ctxt->_eip); done: + if (rc == X86EMUL_PROPAGATE_FAULT) + ctxt->have_exception = true; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f7c34184342a..2307f63efd20 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -133,6 +133,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); + offset = array_index_nospec(offset, map->max_apic_id + 1); *cluster = &map->phys_map[offset]; *mask = dest_id & (0xffff >> (16 - cluster_size)); } else { @@ -208,6 +209,9 @@ static void recalculate_apic_map(struct kvm *kvm) if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) new->phys_map[xapic_id] = apic; + if (!kvm_apic_sw_enabled(apic)) + continue; + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) { @@ -251,6 +255,8 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) recalculate_apic_map(apic->vcpu->kvm); } else static_key_slow_inc(&apic_sw_disabled.key); + + recalculate_apic_map(apic->vcpu->kvm); } } @@ -829,7 +835,8 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { - *dst = &map->phys_map[irq->dest_id]; + u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1); + *dst = &map->phys_map[dest_id]; *bitmap = 1; } return true; @@ -1960,13 +1967,11 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } } - if ((old_value ^ value) & X2APIC_ENABLE) { - if (value & X2APIC_ENABLE) { - kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); - kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); - } else - kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); - } + if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) + kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); + + if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) + kvm_x86_ops->set_virtual_apic_mode(vcpu); apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2159,7 +2164,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; - if (!apic_enabled(apic)) + if (!kvm_apic_hw_enabled(apic)) return -1; __apic_update_ppr(apic, &ppr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4b9935a38347..bc3446d3cfdf 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -16,6 +16,13 @@ #define APIC_BUS_CYCLE_NS 1 #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) +enum lapic_mode { + LAPIC_MODE_DISABLED = 0, + LAPIC_MODE_INVALID = X2APIC_ENABLE, + LAPIC_MODE_XAPIC = MSR_IA32_APICBASE_ENABLE, + LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, +}; + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ @@ -89,6 +96,7 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -220,4 +228,10 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); + +static inline enum lapic_mode kvm_apic_mode(u64 apic_base) +{ + return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); +} + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 364d9895dd56..c0b0135ef07f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -40,6 +40,7 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> +#include <linux/kthread.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -48,6 +49,30 @@ #include <asm/kvm_page_track.h> #include "trace.h" +extern bool itlb_multihit_kvm_mitigation; + +static int __read_mostly nx_huge_pages = -1; +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); + +static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, +}; + +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { + .set = set_nx_huge_pages_recovery_ratio, + .get = param_get_uint, +}; + +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, + &nx_huge_pages_recovery_ratio, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -139,9 +164,6 @@ module_param(dbg, bool, 0644); #include <trace/events/kvm.h> -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) @@ -244,6 +266,11 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); +static bool is_executable_pte(u64 spte); + +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { @@ -264,6 +291,11 @@ static inline bool spte_ad_enabled(u64 spte) return !(spte & shadow_acc_track_value); } +static bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + static inline u64 spte_shadow_accessed_mask(u64 spte) { MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); @@ -1008,10 +1040,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) { - if (sp->role.direct) - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); - else + if (!sp->role.direct) { sp->gfns[index] = gfn; + return; + } + + if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) + pr_err_ratelimited("gfn mismatch under direct page %llx " + "(expected %llx, got %llx)\n", + sp->gfn, + kvm_mmu_page_get_gfn(sp, index), gfn); } /* @@ -1070,6 +1108,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + if (sp->lpage_disallowed) + return; + + ++kvm->stat.nx_lpage_splits; + list_add_tail(&sp->lpage_disallowed_link, + &kvm->arch.lpage_disallowed_mmu_pages); + sp->lpage_disallowed = true; +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -1087,6 +1136,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; + list_del(&sp->lpage_disallowed_link); +} + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -2634,6 +2690,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_reload_remote_mmus(kvm); } + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + sp->role.invalid = 1; return ret; } @@ -2788,6 +2847,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -2903,10 +2967,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", - is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, - *sptep, sptep); + trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2918,8 +2979,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, } } - kvm_release_pfn_clean(pfn); - return ret; } @@ -2954,9 +3013,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (ret <= 0) return -1; - for (i = 0; i < ret; i++, gfn++, start++) + for (i = 0; i < ret; i++, gfn++, start++) { mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); + put_page(pages[i]); + } return 0; } @@ -3004,40 +3065,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { - struct kvm_shadow_walk_iterator iterator; + int level = *levelp; + u64 spte = *it.sptep; + + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + is_nx_huge_page_enabled() && + is_shadow_present_pte(spte) && + !is_large_pte(spte)) { + /* + * A small SPTE exists for this pfn, but FNAME(fetch) + * and __direct_map would like to create a large PTE + * instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of + * the address. + */ + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + *pfnp |= gfn & page_mask; + (*levelp)--; + } +} + +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + int map_writable, int level, kvm_pfn_t pfn, + bool prefault, bool lpage_disallowed) +{ + struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int emulate = 0; - gfn_t pseudo_gfn; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + gfn_t base_gfn = gfn; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return 0; + return RET_PF_RETRY; - for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == level) { - emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, level, gfn, pfn, prefault, - map_writable); - direct_pte_prefetch(vcpu, iterator.sptep); - ++vcpu->stat.pf_fixed; - break; - } + trace_kvm_mmu_spte_requested(gpa, level, pfn); + for_each_shadow_entry(vcpu, gpa, it) { + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &level); - drop_large_spte(vcpu, iterator.sptep); - if (!is_shadow_present_pte(*iterator.sptep)) { - u64 base_addr = iterator.addr; + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == level) + break; - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); - pseudo_gfn = base_addr >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, 1, ACC_ALL); + drop_large_spte(vcpu, it.sptep); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); - link_shadow_page(vcpu, iterator.sptep, sp); + link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } - return emulate; + + ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, + write, level, base_gfn, pfn, prefault, + map_writable); + direct_pte_prefetch(vcpu, it.sptep); + ++vcpu->stat.pf_fixed; + return ret; } static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) @@ -3072,11 +3164,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, kvm_pfn_t *pfnp, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; int level = *levelp; /* @@ -3086,7 +3177,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -3103,8 +3194,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; kvm_release_pfn_clean(pfn); pfn &= ~mask; kvm_get_pfn(pfn); @@ -3331,11 +3420,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level = false; + bool force_pt_level; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + force_pt_level = lpage_disallowed; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { /* @@ -3361,22 +3453,20 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; - + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, + prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } @@ -3922,6 +4012,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -3932,8 +4024,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); + force_pt_level = + lpage_disallowed || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { if (level > PT_DIRECTORY_LEVEL && @@ -3954,22 +4047,20 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; - + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -4313,11 +4404,11 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, */ /* Faults from writes to non-writable pages */ - u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0; + u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; /* Faults from user mode accesses to supervisor pages */ - u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0; + u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; /* Faults from fetches of non-executable pages*/ - u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0; + u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; /* Faults from kernel mode fetches of user pages */ u8 smepf = 0; /* Faults from kernel mode accesses of user pages */ @@ -5253,9 +5344,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { drop_spte(kvm, sptep); need_tlb_flush = 1; goto restart; @@ -5418,13 +5509,30 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { + gen &= MMIO_GEN_MASK; + + /* + * Shift to eliminate the "update in-progress" flag, which isn't + * included in the spte's generation number. + */ + gen >>= 1; + /* - * The very rare case: if the generation-number is round, + * Generation numbers are incremented in multiples of the number of + * address spaces in order to provide unique generations across all + * address spaces. Strip what is effectively the address space + * modifier prior to checking for a wrap of the MMIO generation so + * that a wrap in any address space is detected. + */ + gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); + + /* + * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. */ - if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { + if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } @@ -5437,7 +5545,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -5487,7 +5595,7 @@ unlock: break; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return freed; } @@ -5511,8 +5619,58 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static bool get_nx_auto_mode(void) +{ + /* Return true when CPU has the bug, and mitigations are ON */ + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); +} + +static void __set_nx_huge_pages(bool val) +{ + nx_huge_pages = itlb_multihit_kvm_mitigation = val; +} + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) +{ + bool old_val = nx_huge_pages; + bool new_val; + + /* In "auto" mode deploy workaround only if CPU has the bug. */ + if (sysfs_streq(val, "off")) + new_val = 0; + else if (sysfs_streq(val, "force")) + new_val = 1; + else if (sysfs_streq(val, "auto")) + new_val = get_nx_auto_mode(); + else if (strtobool(val, &new_val) < 0) + return -EINVAL; + + __set_nx_huge_pages(new_val); + + if (new_val != old_val) { + struct kvm *kvm; + int idx; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + idx = srcu_read_lock(&kvm->srcu); + kvm_mmu_invalidate_zap_all_pages(kvm); + srcu_read_unlock(&kvm->srcu, idx); + + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + } + mutex_unlock(&kvm_lock); + } + + return 0; +} + int kvm_mmu_module_init(void) { + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); + kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", @@ -5578,3 +5736,116 @@ void kvm_mmu_module_exit(void) unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } + +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +{ + unsigned int old_val; + int err; + + old_val = nx_huge_pages_recovery_ratio; + err = param_set_uint(val, kp); + if (err) + return err; + + if (READ_ONCE(nx_huge_pages) && + !old_val && nx_huge_pages_recovery_ratio) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + + mutex_unlock(&kvm_lock); + } + + return err; +} + +static void kvm_recover_nx_lpages(struct kvm *kvm) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of lpage_disallowed pages is expected to + * be relatively small compared to the total. + */ + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (to_zap) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} + +static long get_nx_lpage_recovery_timeout(u64 start_time) +{ + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) + ? start_time + 60 * HZ - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; +} + +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +{ + u64 start_time; + long remaining_time; + + while (true) { + start_time = get_jiffies_64(); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop() && remaining_time > 0) { + schedule_timeout(remaining_time); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + return 0; + + kvm_recover_nx_lpages(kvm); + } +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + int err; + + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + "kvm-nx-lpage-recovery", + &kvm->arch.nx_lpage_recovery_thread); + if (!err) + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + + return err; +} + +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_lpage_recovery_thread) + kthread_stop(kvm->arch.nx_lpage_recovery_thread); +} diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index efc857615d8e..068feab64acf 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -195,4 +195,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); + +int kvm_mmu_post_init_vm(struct kvm *kvm); +void kvm_mmu_pre_destroy_vm(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index c73bf4e4988c..918b0d5bf272 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -325,6 +325,65 @@ TRACE_EVENT( __entry->kvm_gen == __entry->spte_gen ) ); + +TRACE_EVENT( + kvm_mmu_set_spte, + TP_PROTO(int level, gfn_t gfn, u64 *sptep), + TP_ARGS(level, gfn, sptep), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, spte) + __field(u64, sptep) + __field(u8, level) + /* These depend on page entry type, so compute them now. */ + __field(bool, r) + __field(bool, x) + __field(u8, u) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->spte = *sptep; + __entry->sptep = virt_to_phys(sptep); + __entry->level = level; + __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); + __entry->x = is_executable_pte(__entry->spte); + __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; + ), + + TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", + __entry->gfn, __entry->spte, + __entry->r ? "r" : "-", + __entry->spte & PT_WRITABLE_MASK ? "w" : "-", + __entry->x ? "x" : "-", + __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), + __entry->level, __entry->sptep + ) +); + +TRACE_EVENT( + kvm_mmu_spte_requested, + TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), + TP_ARGS(addr, level, pfn), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, pfn) + __field(u8, level) + ), + + TP_fast_assign( + __entry->gfn = addr >> PAGE_SHIFT; + __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + __entry->level = level; + ), + + TP_printk("gfn %llx pfn %llx level %d", + __entry->gfn, __entry->pfn, __entry->level + ) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6288e9d7068e..8cf7a09bdd73 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -522,6 +522,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); + kvm_release_pfn_clean(pfn); return true; } @@ -595,12 +596,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault, + bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; + gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -645,35 +648,48 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - for (; - shadow_walk_okay(&it) && it.level > hlevel; - shadow_walk_next(&it)) { - gfn_t direct_gfn; + /* + * FNAME(page_fault) might have clobbered the bottom bits of + * gw->gfn, restore them from the virtual address. + */ + gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); + base_gfn = gfn; + trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); - validate_direct_spte(vcpu, it.sptep, direct_access); - drop_large_spte(vcpu, it.sptep); + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); - if (is_shadow_present_pte(*it.sptep)) - continue; + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == hlevel) + break; + + validate_direct_spte(vcpu, it.sptep, direct_access); - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + drop_large_spte(vcpu, it.sptep); - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, - true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + it.level - 1, true, direct_access); + link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); + } } - clear_sp_write_flooding_count(it.sptep); ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + it.level, base_gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - + ++vcpu->stat.pf_fixed; return ret; out_gpte_changed: - kvm_release_pfn_clean(pfn); return RET_PF_RETRY; } @@ -740,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + bool force_pt_level = lpage_disallowed; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -821,6 +839,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.pte_access &= ~ACC_EXEC_MASK; } + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -829,19 +848,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (!force_pt_level) - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); - ++vcpu->stat.pf_fixed; + level, pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 026db42a86c3..1bca8016ee8a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -131,8 +131,8 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, intr ? kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { - printk_once("kvm_pmu: event creation failed %ld\n", - PTR_ERR(event)); + pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", + PTR_ERR(event), pmc->idx); return; } diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 5ab4a364348e..2729131fe9bf 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -235,11 +235,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, msr))) { - if (!msr_info->host_initiated) - data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { + if (msr_info->host_initiated) + pmc->counter = data; + else + pmc->counter = (s32)data; + return 0; + } else if ((pmc = get_fixed_pmc(pmu, msr))) { + pmc->counter = data; return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4dc79d139810..52edb8cf1c40 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -299,7 +299,7 @@ static int vgif = true; module_param(vgif, int, 0444); static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm); @@ -608,8 +608,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; - if (!npt_enabled && !(efer & EFER_LMA)) - efer &= ~EFER_LME; + + if (!npt_enabled) { + /* Shadow paging assumes NX to be available. */ + efer |= EFER_NX; + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + } to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -1567,7 +1573,11 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) + /* + * Since the host physical APIC id is 8 bits, + * we can support host APIC ID upto 255. + */ + if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); @@ -2093,7 +2103,7 @@ static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); vcpu->arch.cr4 = cr4; if (!npt_enabled) @@ -2211,6 +2221,7 @@ static int pf_interception(struct vcpu_svm *svm) static int db_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + struct kvm_vcpu *vcpu = &svm->vcpu; if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && @@ -2221,6 +2232,8 @@ static int db_interception(struct vcpu_svm *svm) if (svm->nmi_singlestep) { disable_nmi_singlestep(svm); + /* Make sure we check for pending NMIs upon entry */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } if (svm->vcpu.guest_debug & @@ -2431,7 +2444,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -2929,6 +2942,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); + /* + * Drop what we picked up for L2 via svm_complete_interrupts() so it + * doesn't end up in L1. + */ + svm->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + return 0; } @@ -3096,7 +3117,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; svm->nested.intercept = nested_vmcb->control.intercept; - svm_flush_tlb(&svm->vcpu); + svm_flush_tlb(&svm->vcpu, true); svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; @@ -4574,7 +4595,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } -static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { return; } @@ -4622,6 +4643,11 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return false; +} + static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; @@ -4927,7 +4953,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void svm_flush_tlb(struct kvm_vcpu *vcpu) +static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5268,7 +5294,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -5282,7 +5308,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static int is_disabled(void) @@ -5319,6 +5345,13 @@ static bool svm_cpu_has_accelerated_tpr(void) static bool svm_has_emulated_msr(int index) { + switch (index) { + case MSR_IA32_MCG_EXT_CTL: + return false; + default: + break; + } + return true; } @@ -5686,7 +5719,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, - .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, + .set_virtual_apic_mode = svm_set_virtual_apic_mode, .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, @@ -5724,6 +5757,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, }; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 9807c314c478..3bf41413ab15 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -438,13 +438,13 @@ TRACE_EVENT(kvm_apic_ipi, ); TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec), + TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec), TP_ARGS(apicid, dm, tm, vec), TP_STRUCT__entry( __field( __u32, apicid ) __field( __u16, dm ) - __field( __u8, tm ) + __field( __u16, tm ) __field( __u8, vec ) ), diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 16bb8e35605e..c579cda1721e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -27,6 +27,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/sched/smt.h> #include <linux/moduleparam.h> #include <linux/mod_devicetable.h> #include <linux/trace_events.h> @@ -590,7 +591,8 @@ struct nested_vmx { */ bool sync_shadow_vmcs; - bool change_vmcs01_virtual_x2apic_mode; + bool change_vmcs01_virtual_apic_mode; + /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; @@ -739,7 +741,6 @@ struct vcpu_vmx { u64 msr_guest_kernel_gs_base; #endif - u64 arch_capabilities; u64 spec_ctrl; u32 vm_entry_controls_shadow; @@ -1601,7 +1602,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } -static inline void __invvpid(int ext, u16 vpid, gva_t gva) +static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) { struct { u64 vpid : 16; @@ -1615,7 +1616,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) : : "a"(&operand), "c"(ext) : "cc", "memory"); } -static inline void __invept(int ext, u64 eptp, gpa_t gpa) +static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) { struct { u64 eptp, gpa; @@ -2229,7 +2230,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, if (!entry_only) j = find_msr(&m->host, msr); - if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { + if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) || + (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; @@ -2257,17 +2259,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } + /* Shadow paging assumes NX to be available. */ + if (!enable_ept) + guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. @@ -2824,9 +2818,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only * if efer.sce is enabled. @@ -2839,6 +2830,9 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_EFER); if (index >= 0 && update_transition_efer(vmx, index)) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) + move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; @@ -3491,12 +3485,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = to_vmx(vcpu)->spec_ctrl; break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return 1; - msr_info->data = to_vmx(vcpu)->arch_capabilities; - break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; @@ -3661,11 +3649,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; - vmx->arch_capabilities = data; - break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) @@ -4437,9 +4420,10 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, + bool invalidate_gpa) { - if (enable_ept) { + if (enable_ept && (invalidate_gpa || !enable_vpid)) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); @@ -4448,15 +4432,9 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) } } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); -} - -static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) -{ - if (enable_ept) - vmx_flush_tlb(vcpu); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@ -4490,7 +4468,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty)) return; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (is_pae_paging(vcpu)) { vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); @@ -4502,7 +4480,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (is_pae_paging(vcpu)) { mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); @@ -4654,7 +4632,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } @@ -5927,8 +5905,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - vmx->arch_capabilities = kvm_get_arch_capabilities(); - vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ @@ -7444,25 +7420,50 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ off = exit_qualification; /* holds the displacement */ + if (addr_size == 1) + off = (gva_t)sign_extend64(off, 31); + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) off += kvm_register_read(vcpu, index_reg)<<scaling; vmx_get_segment(vcpu, &s, seg_reg); - *ret = s.base + off; + /* + * The effective address, i.e. @off, of a memory operand is truncated + * based on the address size of the instruction. Note that this is + * the *effective address*, i.e. the address prior to accounting for + * the segment's base. + */ if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; + off &= 0xffffffff; + else if (addr_size == 0) /* 16 bit */ + off &= 0xffff; /* Checks for #GP/#SS exceptions. */ exn = false; if (is_long_mode(vcpu)) { + /* + * The virtual/linear address is never truncated in 64-bit + * mode, e.g. a 32-bit address size can yield a 64-bit virtual + * address when using FS/GS with a non-zero base. + */ + *ret = s.base + off; + /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode! */ exn = is_noncanonical_address(*ret, vcpu); } else if (is_protmode(vcpu)) { + /* + * When not in long mode, the virtual/linear address is + * unconditionally truncated to 32 bits regardless of the + * address size. + */ + *ret = (s.base + off) & 0xffffffff; + /* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) @@ -7486,10 +7487,16 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); - /* Protected mode: #GP(0)/#SS(0) if the memory - * operand is outside the segment limit. + + /* + * Protected mode: #GP(0)/#SS(0) if the memory operand is + * outside the segment limit. All CPUs that support VMX ignore + * limit checks for flat segments, i.e. segments with base==0, + * limit==0xffffffff and of type expand-up data or code. */ - exn = exn || (off + sizeof(u64) > s.limit); + if (!(s.base == 0 && s.limit == 0xffffffff && + ((s.type & 8) || !(s.type & 4)))) + exn = exn || (off + sizeof(u64) > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, @@ -7675,6 +7682,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmx->nested.sync_shadow_vmcs = false; } static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) @@ -7686,7 +7694,6 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; @@ -7708,6 +7715,7 @@ static void free_nested(struct vcpu_vmx *vmx) if (!vmx->nested.vmxon) return; + hrtimer_cancel(&vmx->nested.preemption_timer); vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; @@ -7871,6 +7879,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) const unsigned long *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; + if (WARN_ON(!shadow_vmcs)) + return; + preempt_disable(); vmcs_load(shadow_vmcs); @@ -7918,6 +7929,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) u64 field_value = 0; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + if (WARN_ON(!shadow_vmcs)) + return; + vmcs_load(shadow_vmcs); for (q = 0; q < ARRAY_SIZE(fields); q++) { @@ -7970,6 +7984,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; + struct x86_exception e; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7997,8 +8012,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu) vmx_instruction_info, true, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - kvm_write_guest_virt_system(vcpu, gva, &field_value, - (is_long_mode(vcpu) ? 8 : 4), NULL); + if (kvm_write_guest_virt_system(vcpu, gva, &field_value, + (is_long_mode(vcpu) ? 8 : 4), + &e)) + kvm_inject_page_fault(vcpu, &e); } nested_vmx_succeed(vcpu); @@ -8285,7 +8302,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); @@ -9266,31 +9283,43 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } -static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { u32 sec_exec_control; + if (!lapic_in_kernel(vcpu)) + return; + /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true; + to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; return; } - if (!cpu_has_vmx_virtualize_x2apic_mode()) - return; - if (!cpu_need_tpr_shadow(vcpu)) return; sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); - if (set) { - sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; - } else { - sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; - sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb_ept_only(vcpu); + switch (kvm_get_apic_mode(vcpu)) { + case LAPIC_MODE_INVALID: + WARN_ONCE(true, "Invalid local APIC state"); + case LAPIC_MODE_DISABLED: + break; + case LAPIC_MODE_XAPIC: + if (flexpriority_enabled) { + sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmx_flush_tlb(vcpu, true); + } + break; + case LAPIC_MODE_X2APIC: + if (cpu_has_vmx_virtualize_x2apic_mode()) + sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + break; } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); @@ -9318,7 +9347,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) !nested_cpu_has2(get_vmcs12(&vmx->vcpu), SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } } @@ -9405,6 +9434,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return pi_test_on(vcpu_to_pi_desc(vcpu)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -9746,8 +9780,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->__launched = vmx->loaded_vmcs->launched; + /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); asm( /* Store host registers */ @@ -10101,8 +10138,8 @@ free_vcpu: return ERR_PTR(err); } -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" static int vmx_vm_init(struct kvm *kvm) { @@ -10119,7 +10156,7 @@ static int vmx_vm_init(struct kvm *kvm) * Warn upon starting the first VM in a potentially * insecure environment. */ - if (cpu_smt_control == CPU_SMT_ENABLED) + if (sched_smt_active()) pr_warn_once(L1TF_MSG_SMT); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) pr_warn_once(L1TF_MSG_L1D); @@ -10869,8 +10906,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * If PAE paging and EPT are both on, CR3 is not used by the CPU and * must not be dereferenced. */ - if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && - !nested_ept) { + if (is_pae_paging(vcpu) && !nested_ept) { if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { *entry_failure_code = ENTRY_FAIL_PDPTE; return 1; @@ -11177,11 +11213,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true); } } else { vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } } @@ -11205,7 +11241,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } } else if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } /* @@ -11826,24 +11862,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_clear_interrupt_queue(vcpu); } -static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - u32 entry_failure_code; - - nested_ept_uninit_mmu_context(vcpu); - - /* - * Only PDPTE load can fail as the value of cr3 was checked on entry and - * couldn't have changed. - */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; -} - /* * A part of what we need to when the nested L2 guest exits and we want to * run its L1 parent, is to reset L1's guest state to the host state specified @@ -11857,6 +11875,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; + u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -11883,7 +11902,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); vmx_set_cr4(vcpu, vmcs12->host_cr4); - load_vmcs12_mmu_host_state(vcpu, vmcs12); + nested_ept_uninit_mmu_context(vcpu); + + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; if (enable_vpid) { /* @@ -11891,7 +11920,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * L1's vpid. TODO: move to a more elaborate solution, giving * each L2 its own vpid and exposing the vpid feature to L1. */ - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } /* Restore posted intr vector. */ if (nested_cpu_has_posted_intr(vmcs12)) @@ -11974,6 +12003,140 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } +static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) +{ + struct shared_msr_entry *efer_msr; + unsigned int i; + + if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) + return vmcs_read64(GUEST_IA32_EFER); + + if (cpu_has_load_ia32_efer) + return host_efer; + + for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { + if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) + return vmx->msr_autoload.guest.val[i].value; + } + + efer_msr = find_msr_entry(vmx, MSR_EFER); + if (efer_msr) + return efer_msr->data; + + return host_efer; +} + +static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msr_entry g, h; + struct msr_data msr; + gpa_t gpa; + u32 i, j; + + vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); + + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + /* + * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set + * as vmcs01.GUEST_DR7 contains a userspace defined value + * and vcpu->arch.dr7 is not squirreled away before the + * nested VMENTER (not worth adding a variable in nested_vmx). + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + kvm_set_dr(vcpu, 7, DR7_FIXED_1); + else + WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); + } + + /* + * Note that calling vmx_set_{efer,cr0,cr4} is important as they + * handle a variety of side effects to KVM's software model. + */ + vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); + + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); + + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); + + nested_ept_uninit_mmu_context(vcpu); + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + /* + * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs + * from vmcs01 (if necessary). The PDPTRs are not loaded on + * VMFail, like everything else we just need to ensure our + * software model is up-to-date. + */ + ept_save_pdptrs(vcpu); + + kvm_mmu_reset_context(vcpu); + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(vcpu); + + /* + * This nasty bit of open coding is a compromise between blindly + * loading L1's MSRs using the exit load lists (incorrect emulation + * of VMFail), leaving the nested VM's MSRs in the software model + * (incorrect behavior) and snapshotting the modified MSRs (too + * expensive since the lists are unbound by hardware). For each + * MSR that was (prematurely) loaded from the nested VMEntry load + * list, reload it from the exit load list if it exists and differs + * from the guest value. The intent is to stuff host state as + * silently as possible, not to fully process the exit load list. + */ + msr.host_initiated = false; + for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { + gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); + if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { + pr_debug_ratelimited( + "%s read MSR index failed (%u, 0x%08llx)\n", + __func__, i, gpa); + goto vmabort; + } + + for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { + gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); + if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { + pr_debug_ratelimited( + "%s read MSR failed (%u, 0x%08llx)\n", + __func__, j, gpa); + goto vmabort; + } + if (h.index != g.index) + continue; + if (h.value == g.value) + break; + + if (nested_vmx_load_msr_check(vcpu, &h)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, j, h.index, h.reserved); + goto vmabort; + } + + msr.index = h.index; + msr.data = h.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_debug_ratelimited( + "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", + __func__, j, h.index, h.value); + goto vmabort; + } + } + } + + return; + +vmabort: + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); +} + /* * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 * and modify vmcs12 to make it see what it would expect to see there if @@ -12026,14 +12189,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); - if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { - vmx->nested.change_vmcs01_virtual_x2apic_mode = false; - vmx_set_virtual_x2apic_mode(vcpu, - vcpu->arch.apic_base & X2APIC_ENABLE); + if (vmx->nested.change_vmcs01_virtual_apic_mode) { + vmx->nested.change_vmcs01_virtual_apic_mode = false; + vmx_set_virtual_apic_mode(vcpu); } else if (!nested_cpu_has_ept(vmcs12) && nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } /* This is needed for same reason as it was needed in prepare_vmcs02 */ @@ -12106,7 +12268,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - load_vmcs12_mmu_host_state(vcpu, vmcs12); + /* + * Restore L1's host state to KVM's software model. We're here + * because a consistency check was caught by hardware, which + * means some amount of guest state has been propagated to KVM's + * model and needs to be unwound to the host's state. + */ + nested_vmx_restore_host_state(vcpu); /* * The emulated instruction was already skipped in @@ -12584,7 +12752,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, - .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .get_enable_apicv = vmx_get_enable_apicv, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, @@ -12594,6 +12762,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .hwapic_isr_update = vmx_hwapic_isr_update, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 867c22f8d59b..8a51442247c5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,8 +90,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -191,7 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, + { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -275,13 +276,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); int err; - if (((value ^ smsr->values[slot].curr) & mask) == 0) + value = (value & mask) | (smsr->values[slot].host & ~mask); + if (value == smsr->values[slot].curr) return 0; - smsr->values[slot].curr = value; err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); if (err) return 1; + smsr->values[slot].curr = value; if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -306,23 +308,27 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_apic_base); +enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) +{ + return kvm_apic_mode(kvm_get_apic_base(vcpu)); +} +EXPORT_SYMBOL_GPL(kvm_get_apic_mode); + int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - u64 old_state = vcpu->arch.apic_base & - (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); - u64 new_state = msr_info->data & - (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); + enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff | (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); - if ((msr_info->data & reserved_bits) || new_state == X2APIC_ENABLE) - return 1; - if (!msr_info->host_initiated && - ((new_state == MSR_IA32_APICBASE_ENABLE && - old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) || - (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) && - old_state == 0))) + if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) return 1; + if (!msr_info->host_initiated) { + if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) + return 1; + if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) + return 1; + } kvm_lapic_set_base(vcpu, msr_info->data); return 0; @@ -563,8 +569,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, data, offset, len, access); } +static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | + rsvd_bits(1, 2); +} + /* - * Load the pae pdptrs. Return true is they are all valid. + * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { @@ -583,8 +595,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && - (pdpte[i] & - vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { + (pdpte[i] & pdptr_rsvd_bits(vcpu))) { ret = 0; goto out; } @@ -610,7 +621,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) gfn_t gfn; int r; - if (is_long_mode(vcpu) || !is_pae(vcpu)) + if (!is_pae_paging(vcpu)) return false; if (!test_bit(VCPU_EXREG_PDPTR, @@ -839,8 +850,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu) && (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))) return 1; - else if (is_pae(vcpu) && is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + else if (is_pae_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; vcpu->arch.cr3 = cr3; @@ -1061,6 +1072,14 @@ u64 kvm_get_arch_capabilities(void) rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); /* + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that + * the nested hypervisor runs with NX huge pages. If it is not, + * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 guests, so it need not worry about its own (L2) guests. + */ + data |= ARCH_CAP_PSCHANGE_MC_NO; + + /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us @@ -1072,8 +1091,40 @@ u64 kvm_get_arch_capabilities(void) if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + data |= ARCH_CAP_RDCL_NO; + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + data |= ARCH_CAP_SSB_NO; + if (!boot_cpu_has_bug(X86_BUG_MDS)) + data |= ARCH_CAP_MDS_NO; + + /* + * On TAA affected systems, export MDS_NO=0 when: + * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. + * - Updated microcode is present. This is detected by + * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures + * that VERW clears CPU buffers. + * + * When MDS_NO=0 is exported, guests deploy clear CPU buffer + * mitigation and don't complain: + * + * "Vulnerable: Clear CPU buffers attempted, no microcode" + * + * If TSX is disabled on the system, guests are also mitigated against + * TAA and clear CPU buffer mitigation is not required for guests. + */ + if (!boot_cpu_has(X86_FEATURE_RTM)) + data &= ~ARCH_CAP_TAA_NO; + else if (!boot_cpu_has_bug(X86_BUG_TAA)) + data |= ARCH_CAP_TAA_NO; + else if (data & ARCH_CAP_TSX_CTRL_MSR) + data &= ~ARCH_CAP_MDS_NO; + + /* KVM does not emulate MSR_IA32_TSX_CTRL. */ + data &= ~ARCH_CAP_TSX_CTRL_MSR; return data; } + EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); static int kvm_get_msr_feature(struct kvm_msr_entry *msr) @@ -1107,31 +1158,42 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return 0; } -bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & efer_reserved_bits) - return false; - if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) - return false; + return false; if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - return false; + return false; return true; + +} +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & efer_reserved_bits) + return false; + + return __kvm_valid_efer(vcpu, efer); } EXPORT_SYMBOL_GPL(kvm_valid_efer); -static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; + u64 efer = msr_info->data; - if (!kvm_valid_efer(vcpu, efer)) + if (efer & efer_reserved_bits) return 1; - if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; + if (!msr_info->host_initiated) { + if (!__kvm_valid_efer(vcpu, efer)) + return 1; + + if (is_paging(vcpu) && + (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; + } efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; @@ -1381,7 +1443,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) vcpu->arch.tsc_always_catchup = 1; return 0; } else { - WARN(1, "user requested TSC rate below hardware speed\n"); + pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); return -1; } } @@ -1391,8 +1453,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) user_tsc_khz, tsc_khz); if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { - WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); + pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); return -1; } @@ -2234,8 +2296,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated) vcpu->arch.microcode_version = data; break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.arch_capabilities = data; + break; case MSR_EFER: - return set_efer(vcpu, data); + return set_efer(vcpu, msr_info); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ @@ -2523,6 +2590,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_UCODE_REV: msr_info->data = vcpu->arch.microcode_version; break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return 1; + msr_info->data = vcpu->arch.arch_capabilities; + break; case MSR_MTRRcap: case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -2803,6 +2876,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; case KVM_CAP_NR_MEMSLOTS: r = KVM_USER_MEM_SLOTS; break; @@ -4611,6 +4687,13 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + /* + * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED + * is returned, but our callers are not ready for that and they blindly + * call kvm_inject_page_fault. Ensure that they at least do not leak + * uninitialized kernel stack memory into cr2 and error code. + */ + memset(exception, 0, sizeof(*exception)); return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } @@ -4689,6 +4772,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, /* kvm_write_guest_virt_system can pull in tons of pages. */ vcpu->arch.l1tf_flush_l1d = true; + /* + * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED + * is returned, but our callers are not ready for that and they blindly + * call kvm_inject_page_fault. Ensure that they at least do not leak + * uninitialized kernel stack memory into cr2 and error code. + */ + memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -5853,8 +5943,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) return EMULATE_DONE; - if (ctxt->have_exception && inject_emulated_exception(vcpu)) + if (ctxt->have_exception) { + /* + * #UD should result in just EMULATION_FAILED, and trap-like + * exception should not be encountered during decode. + */ + WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || + exception_type(ctxt->exception.vector) == EXCPT_TRAP); + inject_emulated_exception(vcpu); return EMULATE_DONE; + } if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; return handle_emulation_failure(vcpu); @@ -5922,12 +6020,13 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && ctxt->tf) - kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || - exception_type(ctxt->exception.vector) == EXCPT_TRAP) + exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_rip_write(vcpu, ctxt->eip); + if (r == EMULATE_DONE && ctxt->tf) + kvm_vcpu_do_singlestep(vcpu, &r); __kvm_set_rflags(vcpu, ctxt->eflags); + } /* * For STI, interrupts are shadowed; so KVM_REQ_EVENT will @@ -6072,17 +6171,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != smp_processor_id()) + if (vcpu->cpu != raw_smp_processor_id()) send_ipi = 1; } } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -6219,12 +6318,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; int i; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); atomic_set(&kvm_guest_has_master_clock, 0); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); @@ -6890,10 +6989,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); + kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -6964,7 +7063,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu); + kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; @@ -7694,7 +7793,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu)) { + if (is_pae_paging(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } @@ -7911,6 +8010,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; + vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); kvm_vcpu_mtrr_init(vcpu); r = vcpu_load(vcpu); if (r) @@ -8276,6 +8376,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -8305,6 +8406,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } +int kvm_arch_post_init_vm(struct kvm *kvm) +{ + return kvm_mmu_post_init_vm(kvm); +} + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { int r; @@ -8408,6 +8514,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(x86_set_memory_region); +void kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ + kvm_mmu_pre_destroy_vm(kvm); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { @@ -8517,13 +8628,13 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ - kvm_mmu_invalidate_mmio_sptes(kvm, slots); + kvm_mmu_invalidate_mmio_sptes(kvm, gen); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -8678,6 +8789,22 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + kvm_test_request(KVM_REQ_SMI, vcpu) || + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) + return true; + + return false; +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu->arch.preempted_in_kernel; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d4b59cf0dc51..68eb0d03e5fc 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -94,6 +94,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } +static inline bool is_pae_paging(struct kvm_vcpu *vcpu) +{ + return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu); +} + static inline u32 bit(int bitno) { return 1 << (bitno & 31); @@ -136,6 +141,11 @@ static inline bool emul_is_noncanonical_address(u64 la, static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, unsigned access) { + u64 gen = kvm_memslots(vcpu->kvm)->generation; + + if (unlikely(gen & 1)) + return; + /* * If this is a shadow nested page table, the "GVA" is * actually a nGPA. @@ -143,7 +153,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; - vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; + vcpu->arch.mmio_gen = gen; } static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index d435c89875c1..60b410ff31e8 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -6,6 +6,18 @@ # Produces uninteresting flaky coverage. KCOV_INSTRUMENT_delay.o := n +# Early boot use of cmdline; don't instrument it +ifdef CONFIG_AMD_MEM_ENCRYPT +KCOV_INSTRUMENT_cmdline.o := n +KASAN_SANITIZE_cmdline.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_cmdline.o = -pg +endif + +CFLAGS_cmdline.o := $(call cc-option, -fno-stack-protector) +endif + inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c index 2dd1fe13a37b..19f707992db2 100644 --- a/arch/x86/lib/cpu.c +++ b/arch/x86/lib/cpu.c @@ -1,5 +1,6 @@ #include <linux/types.h> #include <linux/export.h> +#include <asm/cpu.h> unsigned int x86_family(unsigned int sig) { diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 4846eff7e4c8..17a0d0f5a1bf 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -113,8 +113,8 @@ static void delay_mwaitx(unsigned long __loops) __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* - * AMD, like Intel, supports the EAX hint and EAX=0xf - * means, do not enter any deep C-state and we use it + * AMD, like Intel's MWAIT version, supports the EAX hint and + * EAX=0xf0 means, do not enter any deep C-state and we use it * here in delay() to minimize wakeup latency. */ __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index e0b85930dd77..0a0e9112f284 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -333,7 +333,7 @@ AVXcode: 1 06: CLTS 07: SYSRET (o64) 08: INVD -09: WBINVD +09: WBINVD | WBNOINVD (F3) 0a: 0b: UD2 (1B) 0c: @@ -364,7 +364,7 @@ AVXcode: 1 # a ModR/M byte. 1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev 1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv -1c: +1c: Grp20 (1A),(1C) 1d: 1e: 1f: NOP Ev @@ -792,6 +792,8 @@ f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) +f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) +f9: MOVDIRI My,Gy EndTable Table: 3-byte opcode 2 (0x0f 0x3a) @@ -943,9 +945,9 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: rdpkru (110),(11B) | wrpkru (111),(11B) @@ -1020,7 +1022,7 @@ GrpTable: Grp15 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE | ptwrite Ey (F3),(11B) 5: XRSTOR | lfence (11B) -6: XSAVEOPT | clwb (66) | mfence (11B) +6: XSAVEOPT | clwb (66) | mfence (11B) | TPAUSE Rd (66),(11B) | UMONITOR Rv (F3),(11B) | UMWAIT Rd (F2),(11B) 7: clflush | clflushopt (66) | sfence (11B) EndTable @@ -1051,6 +1053,10 @@ GrpTable: Grp19 6: vscatterpf1qps/d Wx (66),(ev) EndTable +GrpTable: Grp20 +0: cldemote Mb +EndTable + # AMD's Prefetch Group GrpTable: GrpP 0: PREFETCH diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index a5a41ec58072..0c122226ca56 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -177,7 +177,7 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y) #define setexponentpos(x,y) { (*(short *)&((x)->exp)) = \ ((y) + EXTENDED_Ebias) & 0x7fff; } #define exponent16(x) (*(short *)&((x)->exp)) -#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); } +#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (u16)(y); } #define addexponent(x,y) { (*(short *)&((x)->exp)) += (y); } #define stdexp(x) { (*(short *)&((x)->exp)) += EXTENDED_Ebias; } diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c index 8dc9095bab22..742619e94bdf 100644 --- a/arch/x86/math-emu/reg_constant.c +++ b/arch/x86/math-emu/reg_constant.c @@ -18,7 +18,7 @@ #include "control_w.h" #define MAKE_REG(s, e, l, h) { l, h, \ - ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } + (u16)((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); #if 0 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 794c35c4ca73..27cab342a0b2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -260,13 +260,14 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) + if (pmd_present(*pmd) != pmd_present(*pmd_k)) set_pmd(pmd, *pmd_k); + + if (!pmd_present(*pmd_k)) + return NULL; else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } @@ -286,17 +287,13 @@ void vmalloc_sync_all(void) spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; - pmd_t *ret; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - ret = vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), address); spin_unlock(pgt_lock); - - if (!ret) - break; } spin_unlock(&pgd_lock); } @@ -426,8 +423,6 @@ static noinline int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - WARN_ON_ONCE(in_nmi()); - /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. In the later diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index af6f2f9c6a26..5813950b5877 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -194,7 +194,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) if (!IS_ENABLED(CONFIG_X86_5LEVEL)) return (p4d_t *)pgd; - p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; + p4d = pgd_val(*pgd) & PTE_PFN_MASK; p4d += __START_KERNEL_map - phys_base; return (p4d_t *)p4d + p4d_index(addr); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index aafd4edfa2ac..55338b392221 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -260,7 +260,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - pgdp[i] = native_make_pgd(0); + pgd_clear(&pgdp[i]); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); @@ -430,7 +430,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(*ptep, entry); if (changed && dirty) - *ptep = entry; + set_pte(ptep, entry); return changed; } @@ -445,7 +445,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { - *pmdp = entry; + set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, @@ -465,7 +465,7 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { - *pudp = entry; + set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, @@ -590,8 +590,8 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) fixmaps_set++; } -void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, - pgprot_t flags) +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) { __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 60c48f5d6b0e..33c6ee9aebbd 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -35,6 +35,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/uaccess.h> +#include <linux/cpu.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> @@ -91,7 +92,8 @@ void __init pti_check_boottime_disable(void) goto autosel; } - if (cmdline_find_option_bool(boot_command_line, "nopti")) { + if (cmdline_find_option_bool(boot_command_line, "nopti") || + cpu_mitigations_off()) { pti_print_if_insecure("disabled on command line."); return; } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5400a24e1a8c..c5d7b4ae17ca 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -651,9 +651,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * that UV should be updated so that smp_call_function_many(), * etc, are optimal on UV. */ - unsigned int cpu; - - cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) smp_call_function_many(cpumask, flush_tlb_func_remote, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a9deb2b0397d..cdb386fa7101 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -16,8 +16,6 @@ #include <asm/nospec-branch.h> #include <linux/bpf.h> -int bpf_jit_enable __read_mostly; - /* * assembly code in arch/x86/net/bpf_jit.S */ diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index 526536c81ddc..ca1e8e6dccc8 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -50,8 +50,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func) word1 = read_pci_config_16(bus, slot, func, 0xc0); word2 = read_pci_config_16(bus, slot, func, 0xc2); if (word1 != word2) { - res.start = (word1 << 16) | 0x0000; - res.end = (word2 << 16) | 0xffff; + res.start = ((resource_size_t) word1 << 16) | 0x0000; + res.end = ((resource_size_t) word2 << 16) | 0xffff; res.flags = IORESOURCE_MEM; update_res(info, res.start, res.end, res.flags, 0); } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 4210da7b44de..33e9b4f1ce20 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -589,6 +589,17 @@ static void pci_fixup_amd_ehci_pme(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme); /* + * Device [1022:7914] + * When in D0, PME# doesn't get asserted when plugging USB 2.0 device. + */ +static void pci_fixup_amd_fch_xhci_pme(struct pci_dev *dev) +{ + dev_info(&dev->dev, "PME# does not work under D0, disabling it\n"); + dev->pme_support &= ~(PCI_PM_CAP_PME_D0 >> PCI_PM_CAP_PME_SHIFT); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7914, pci_fixup_amd_fch_xhci_pme); + +/* * Apple MacBook Pro: Avoid [mem 0x7fa00000-0x7fbfffff] * * Using the [mem 0x7fa00000-0x7fbfffff] region, e.g., by assigning it to diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0452629148be..c77f565a04f2 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1118,6 +1118,8 @@ static const struct dmi_system_id pciirq_dmi_table[] __initconst = { void __init pcibios_irq_init(void) { + struct irq_routing_table *rtable = NULL; + DBG(KERN_DEBUG "PCI: IRQ init\n"); if (raw_pci_ops == NULL) @@ -1128,8 +1130,10 @@ void __init pcibios_irq_init(void) pirq_table = pirq_find_routing_table(); #ifdef CONFIG_PCI_BIOS - if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) { pirq_table = pcibios_get_irq_routing_table(); + rtable = pirq_table; + } #endif if (pirq_table) { pirq_peer_trick(); @@ -1144,8 +1148,10 @@ void __init pcibios_irq_init(void) * If we're using the I/O APIC, avoid using the PCI IRQ * routing table */ - if (io_apic_assign_pci_irqs) + if (io_apic_assign_pci_irqs) { + kfree(rtable); pirq_table = NULL; + } } x86_init.pci.fixup_irqs(); diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index d49d3be81953..ecb5866aaf84 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -154,8 +154,8 @@ static void punit_dbgfs_unregister(void) (kernel_ulong_t)&drv_data } static const struct x86_cpu_id intel_punit_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt), - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng), + ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt), + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng), ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), {} }; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 9061babfbc83..335a62e74a2e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -893,9 +893,6 @@ static void __init kexec_enter_virtual_mode(void) if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX)) runtime_code_page_mkexec(); - - /* clean DUMMY object */ - efi_delete_dummy_variable(); #endif } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 5b513ccffde4..cadd7fd290fa 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -257,10 +257,6 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; } - /* No need to reserve regions that will never be freed. */ - if (md.attribute & EFI_MEMORY_RUNTIME) - return; - size += addr % EFI_PAGE_SIZE; size = round_up(size, EFI_PAGE_SIZE); addr = round_down(addr, EFI_PAGE_SIZE); @@ -290,6 +286,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) early_memunmap(new, new_size); efi_memmap_install(new_phys, num_entries); + e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__update_table(e820_table); } /* diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index 5a0483e7bf66..31dce781364c 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -68,7 +68,7 @@ static struct bt_sfi_data tng_bt_sfi_data __initdata = { { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata } static const struct x86_cpu_id bt_sfi_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, tng_bt_sfi_data), + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, tng_bt_sfi_data), {} }; diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 4a6a5a26c582..eb33432f2f24 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -29,7 +29,8 @@ struct uv_systab *uv_systab; -s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) +static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) { struct uv_systab *tab = uv_systab; s64 ret; @@ -51,6 +52,19 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) return ret; } + +s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) +{ + s64 ret; + + if (down_interruptible(&__efi_uv_runtime_lock)) + return BIOS_STATUS_ABORT; + + ret = __uv_bios_call(which, a1, a2, a3, a4, a5); + up(&__efi_uv_runtime_lock); + + return ret; +} EXPORT_SYMBOL_GPL(uv_bios_call); s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, @@ -59,10 +73,15 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, unsigned long bios_flags; s64 ret; + if (down_interruptible(&__efi_uv_runtime_lock)) + return BIOS_STATUS_ABORT; + local_irq_save(bios_flags); - ret = uv_bios_call(which, a1, a2, a3, a4, a5); + ret = __uv_bios_call(which, a1, a2, a3, a4, a5); local_irq_restore(bios_flags); + up(&__efi_uv_runtime_lock); + return ret; } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 04d5157fe7f8..3aa3149df07f 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -13,6 +13,7 @@ #include <linux/smp.h> #include <linux/perf_event.h> #include <linux/tboot.h> +#include <linux/dmi.h> #include <asm/pgtable.h> #include <asm/proto.h> @@ -24,7 +25,7 @@ #include <asm/debugreg.h> #include <asm/cpu.h> #include <asm/mmu_context.h> -#include <linux/dmi.h> +#include <asm/cpu_device_id.h> #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; @@ -82,12 +83,8 @@ static void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ -#ifdef CONFIG_X86_32 store_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - store_idt((struct desc_ptr *)&ctxt->idt_limit); -#endif + /* * We save it here, but restore it only in the hibernate case. * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit @@ -103,22 +100,18 @@ static void __save_processor_state(struct saved_context *ctxt) /* * segment registers */ -#ifdef CONFIG_X86_32 - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); +#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); +#endif +#ifdef CONFIG_X86_64 + savesegment(gs, ctxt->gs); + savesegment(fs, ctxt->fs); + savesegment(ds, ctxt->ds); + savesegment(es, ctxt->es); rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); mtrr_save_fixed_ranges(NULL); rdmsrl(MSR_EFER, ctxt->efer); @@ -180,6 +173,9 @@ static void fix_processor_context(void) write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); syscall_init(); /* This sets MSR_*STAR and related */ +#else + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ @@ -192,9 +188,12 @@ static void fix_processor_context(void) } /** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + * + * The asm code that gets us here will have restored a usable GDT, although + * it will be pointing to the wrong alias. */ static void notrace __restore_processor_state(struct saved_context *ctxt) { @@ -217,46 +216,52 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); + /* Restore the IDT. */ + load_idt(&ctxt->idt); + /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). + * Just in case the asm code got us here with the SS, DS, or ES + * out of sync with the GDT, update them. */ -#ifdef CONFIG_X86_32 - load_idt(&ctxt->idt); + loadsegment(ss, __KERNEL_DS); + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + /* + * Restore percpu access. Percpu access can happen in exception + * handlers or in complicated helpers like load_gs_index(). + */ +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); #else -/* CONFIG_X86_64 */ - load_idt((const struct desc_ptr *)&ctxt->idt_limit); + loadsegment(fs, __KERNEL_PERCPU); + loadsegment(gs, __KERNEL_STACK_CANARY); #endif + /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ + fix_processor_context(); + /* - * segment registers + * Now that we have descriptor tables fully restored and working + * exception handling, restore the usermode segments. */ -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_64 + loadsegment(ds, ctxt->es); loadsegment(es, ctxt->es); loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); + load_gs_index(ctxt->gs); /* - * sysenter MSRs + * Restore FSBASE and GSBASE after restoring the selectors, since + * restoring the selectors clobbers the bases. Keep in mind + * that MSR_KERNEL_GS_BASE is horribly misnamed. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); +#elif defined(CONFIG_X86_32_LAZY_GS) + loadsegment(gs, ctxt->gs); #endif - fix_processor_context(); - do_fpu_end(); tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); @@ -295,7 +300,17 @@ int hibernate_resume_nonboot_cpu_disable(void) * address in its instruction pointer may not be possible to resolve * any more at that point (the page tables used by it previously may * have been overwritten by hibernate image data). + * + * First, make sure that we wake up all the potentially disabled SMT + * threads which have been initially brought up and then put into + * mwait/cpuidle sleep. + * Those will be put to proper (not interfering with hibernation + * resume) sleep afterwards, and the resumed kernel will decide itself + * what to do with them. */ + ret = cpuhp_smt_enable(); + if (ret) + return ret; smp_ops.play_dead = resume_play_dead; ret = disable_nonboot_cpus(); smp_ops.play_dead = play_dead; @@ -384,15 +399,14 @@ static int __init bsp_pm_check_init(void) core_initcall(bsp_pm_check_init); -static int msr_init_context(const u32 *msr_id, const int total_num) +static int msr_build_context(const u32 *msr_id, const int num) { - int i = 0; + struct saved_msrs *saved_msrs = &saved_context.saved_msrs; struct saved_msr *msr_array; + int total_num; + int i, j; - if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) { - pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n"); - return -EINVAL; - } + total_num = saved_msrs->num + num; msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL); if (!msr_array) { @@ -400,19 +414,30 @@ static int msr_init_context(const u32 *msr_id, const int total_num) return -ENOMEM; } - for (i = 0; i < total_num; i++) { - msr_array[i].info.msr_no = msr_id[i]; + if (saved_msrs->array) { + /* + * Multiple callbacks can invoke this function, so copy any + * MSR save requests from previous invocations. + */ + memcpy(msr_array, saved_msrs->array, + sizeof(struct saved_msr) * saved_msrs->num); + + kfree(saved_msrs->array); + } + + for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) { + msr_array[i].info.msr_no = msr_id[j]; msr_array[i].valid = false; msr_array[i].info.reg.q = 0; } - saved_context.saved_msrs.num = total_num; - saved_context.saved_msrs.array = msr_array; + saved_msrs->num = total_num; + saved_msrs->array = msr_array; return 0; } /* - * The following section is a quirk framework for problematic BIOSen: + * The following sections are a quirk framework for problematic BIOSen: * Sometimes MSRs are modified by the BIOSen after suspended to * RAM, this might cause unexpected behavior after wakeup. * Thus we save/restore these specified MSRs across suspend/resume @@ -427,7 +452,7 @@ static int msr_initialize_bdw(const struct dmi_system_id *d) u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL }; pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident); - return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); + return msr_build_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); } static const struct dmi_system_id msr_save_dmi_table[] = { @@ -442,9 +467,58 @@ static const struct dmi_system_id msr_save_dmi_table[] = { {} }; +static int msr_save_cpuid_features(const struct x86_cpu_id *c) +{ + u32 cpuid_msr_id[] = { + MSR_AMD64_CPUID_FN_1, + }; + + pr_info("x86/pm: family %#hx cpu detected, MSR saving is needed during suspending.\n", + c->family); + + return msr_build_context(cpuid_msr_id, ARRAY_SIZE(cpuid_msr_id)); +} + +static const struct x86_cpu_id msr_save_cpu_table[] = { + { + .vendor = X86_VENDOR_AMD, + .family = 0x15, + .model = X86_MODEL_ANY, + .feature = X86_FEATURE_ANY, + .driver_data = (kernel_ulong_t)msr_save_cpuid_features, + }, + { + .vendor = X86_VENDOR_AMD, + .family = 0x16, + .model = X86_MODEL_ANY, + .feature = X86_FEATURE_ANY, + .driver_data = (kernel_ulong_t)msr_save_cpuid_features, + }, + {} +}; + +typedef int (*pm_cpu_match_t)(const struct x86_cpu_id *); +static int pm_cpu_check(const struct x86_cpu_id *c) +{ + const struct x86_cpu_id *m; + int ret = 0; + + m = x86_match_cpu(msr_save_cpu_table); + if (m) { + pm_cpu_match_t fn; + + fn = (pm_cpu_match_t)m->driver_data; + ret = fn(m); + } + + return ret; +} + static int pm_check_save_msr(void) { dmi_check_system(msr_save_dmi_table); + pm_cpu_check(msr_save_cpu_table); + return 0; } diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 0ef5e5204968..692a179b1ba3 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -13,6 +13,7 @@ #include <linux/suspend.h> #include <linux/scatterlist.h> #include <linux/kdebug.h> +#include <linux/cpu.h> #include <crypto/hash.h> @@ -249,9 +250,9 @@ static int get_e820_md5(struct e820_table *table, void *buf) return ret; } -static void hibernation_e820_save(void *buf) +static int hibernation_e820_save(void *buf) { - get_e820_md5(e820_table_firmware, buf); + return get_e820_md5(e820_table_firmware, buf); } static bool hibernation_e820_mismatch(void *buf) @@ -271,8 +272,9 @@ static bool hibernation_e820_mismatch(void *buf) return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; } #else -static void hibernation_e820_save(void *buf) +static int hibernation_e820_save(void *buf) { + return 0; } static bool hibernation_e820_mismatch(void *buf) @@ -317,9 +319,7 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) rdr->magic = RESTORE_MAGIC; - hibernation_e820_save(rdr->e820_digest); - - return 0; + return hibernation_e820_save(rdr->e820_digest); } /** @@ -347,3 +347,35 @@ int arch_hibernation_header_restore(void *addr) return 0; } + +int arch_resume_nosmt(void) +{ + int ret = 0; + /* + * We reached this while coming out of hibernation. This means + * that SMT siblings are sleeping in hlt, as mwait is not safe + * against control transition during resume (see comment in + * hibernate_resume_nonboot_cpu_disable()). + * + * If the resumed kernel has SMT disabled, we have to take all the + * SMT siblings out of hlt, and offline them again so that they + * end up in mwait proper. + * + * Called with hotplug disabled. + */ + cpu_hotplug_enable(); + if (cpu_smt_control == CPU_SMT_DISABLED || + cpu_smt_control == CPU_SMT_FORCE_DISABLED) { + enum cpuhp_smt_control old = cpu_smt_control; + + ret = cpuhp_smt_enable(); + if (ret) + goto out; + ret = cpuhp_smt_disable(old); + if (ret) + goto out; + } +out: + cpu_hotplug_disable(); + return ret; +} diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 4463fa72db94..96cb20de08af 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -47,7 +47,7 @@ $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE targets += realmode.lds $(obj)/realmode.lds: $(obj)/pasyms.h -LDFLAGS_realmode.elf := --emit-relocs -T +LDFLAGS_realmode.elf := -m elf_i386 --emit-relocs -T CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj) targets += realmode.elf diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index b02a36b2c14f..a42015b305f4 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -69,7 +69,7 @@ BEGIN { lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" + lprefix3_expr = "\\((F2|!F3|66&F2)\\)" lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 @@ -257,7 +257,7 @@ function convert_operands(count,opnd, i,j,imm,mod) return add_flags(imm, mod) } -/^[0-9a-f]+\:/ { +/^[0-9a-f]+:/ { if (NR == 1) next # get index diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index a18703be9ead..4769a069d5bd 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -77,7 +77,9 @@ static efi_system_table_t __init *xen_efi_probe(void) efi.get_variable = xen_efi_get_variable; efi.get_next_variable = xen_efi_get_next_variable; efi.set_variable = xen_efi_set_variable; + efi.set_variable_nonblocking = xen_efi_set_variable; efi.query_variable_info = xen_efi_query_variable_info; + efi.query_variable_info_nonblocking = xen_efi_query_variable_info; efi.update_capsule = xen_efi_update_capsule; efi.query_capsule_caps = xen_efi_query_capsule_caps; efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 515d5e4414c2..00fc683a2011 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -259,19 +259,41 @@ void xen_reboot(int reason) BUG(); } +static int reboot_reason = SHUTDOWN_reboot; +static bool xen_legacy_crash; void xen_emergency_restart(void) { - xen_reboot(SHUTDOWN_reboot); + xen_reboot(reboot_reason); } static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { - if (!kexec_crash_loaded()) - xen_reboot(SHUTDOWN_crash); + if (!kexec_crash_loaded()) { + if (xen_legacy_crash) + xen_reboot(SHUTDOWN_crash); + + reboot_reason = SHUTDOWN_crash; + + /* + * If panic_timeout==0 then we are supposed to wait forever. + * However, to preserve original dom0 behavior we have to drop + * into hypervisor. (domU behavior is controlled by its + * config file) + */ + if (panic_timeout == 0) + panic_timeout = -1; + } return NOTIFY_DONE; } +static int __init parse_xen_legacy_crash(char *arg) +{ + xen_legacy_crash = true; + return 0; +} +early_param("xen_legacy_crash", parse_xen_legacy_crash); + static struct notifier_block xen_panic_block = { .notifier_call = xen_panic_event, .priority = INT_MIN diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index fd173e6425cc..f79a0cdc6b4e 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -598,12 +598,12 @@ struct trap_array_entry { static struct trap_array_entry trap_array[] = { { debug, xen_xendebug, true }, - { int3, xen_xenint3, true }, { double_fault, xen_double_fault, true }, #ifdef CONFIG_X86_MCE { machine_check, xen_machine_check, true }, #endif { nmi, xen_xennmi, true }, + { int3, xen_int3, false }, { overflow, xen_overflow, false }, #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, @@ -900,10 +900,7 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) val = native_read_msr_safe(msr, err); switch (msr) { case MSR_IA32_APICBASE: -#ifdef CONFIG_X86_X2APIC - if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) -#endif - val &= ~X2APIC_ENABLE; + val &= ~X2APIC_ENABLE; break; } return val; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 7631e6130d44..44b1f1334ef8 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2080,10 +2080,10 @@ void __init xen_relocate_p2m(void) pt = early_memremap(pt_phys, PAGE_SIZE); clear_page(pt); for (idx_pte = 0; - idx_pte < min(n_pte, PTRS_PER_PTE); - idx_pte++) { - set_pte(pt + idx_pte, - pfn_pte(p2m_pfn, PAGE_KERNEL)); + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + pt[idx_pte] = pfn_pte(p2m_pfn, + PAGE_KERNEL); p2m_pfn++; } n_pte -= PTRS_PER_PTE; @@ -2091,8 +2091,7 @@ void __init xen_relocate_p2m(void) make_lowmem_page_readonly(__va(pt_phys)); pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, PFN_DOWN(pt_phys)); - set_pmd(pmd + idx_pt, - __pmd(_PAGE_TABLE | pt_phys)); + pmd[idx_pt] = __pmd(_PAGE_TABLE | pt_phys); pt_phys += PAGE_SIZE; } n_pt -= PTRS_PER_PMD; @@ -2100,7 +2099,7 @@ void __init xen_relocate_p2m(void) make_lowmem_page_readonly(__va(pmd_phys)); pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, PFN_DOWN(pmd_phys)); - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pud[idx_pmd] = __pud(_PAGE_TABLE | pmd_phys); pmd_phys += PAGE_SIZE; } n_pmd -= PTRS_PER_PUD; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 417b339e5c8e..a93d8a7cef26 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -12,6 +12,7 @@ #include <asm/segment.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> +#include <asm/asm.h> #include <xen/interface/xen.h> @@ -24,13 +25,13 @@ ENTRY(xen_\name) pop %r11 jmp \name END(xen_\name) +_ASM_NOKPROBE(xen_\name) .endm xen_pv_trap divide_error xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap int3 -xen_pv_trap xenint3 xen_pv_trap xennmi xen_pv_trap overflow xen_pv_trap bounds |