Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 64
-rw-r--r--  arch/x86/Kconfig.debug | 2
-rw-r--r--  arch/x86/Makefile | 3
-rw-r--r--  arch/x86/boot/Makefile | 2
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 2
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 9
-rw-r--r--  arch/x86/boot/compressed/misc.c | 1
-rw-r--r--  arch/x86/boot/compressed/misc.h | 1
-rw-r--r--  arch/x86/crypto/crct10dif-pclmul_glue.c | 13
-rw-r--r--  arch/x86/crypto/poly1305-avx2-x86_64.S | 14
-rw-r--r--  arch/x86/crypto/poly1305-sse2-x86_64.S | 22
-rw-r--r--  arch/x86/entry/calling.h | 18
-rw-r--r--  arch/x86/entry/common.c | 3
-rw-r--r--  arch/x86/entry/entry_32.S | 3
-rw-r--r--  arch/x86/entry/entry_64.S | 23
-rw-r--r--  arch/x86/entry/vdso/Makefile | 23
-rw-r--r--  arch/x86/entry/vdso/vdso32-setup.c | 1
-rw-r--r--  arch/x86/events/amd/core.c | 147
-rw-r--r--  arch/x86/events/amd/ibs.c | 21
-rw-r--r--  arch/x86/events/amd/uncore.c | 112
-rw-r--r--  arch/x86/events/core.c | 9
-rw-r--r--  arch/x86/events/intel/core.c | 26
-rw-r--r--  arch/x86/events/intel/cstate.c | 4
-rw-r--r--  arch/x86/events/intel/ds.c | 30
-rw-r--r--  arch/x86/events/msr.c | 4
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 29
-rw-r--r--  arch/x86/include/asm/apic.h | 2
-rw-r--r--  arch/x86/include/asm/atomic.h | 21
-rw-r--r--  arch/x86/include/asm/atomic64_64.h | 8
-rw-r--r--  arch/x86/include/asm/barrier.h | 4
-rw-r--r--  arch/x86/include/asm/bootparam_utils.h | 61
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 22
-rw-r--r--  arch/x86/include/asm/crash.h | 2
-rw-r--r--  arch/x86/include/asm/fixmap.h | 2
-rw-r--r--  arch/x86/include/asm/insn.h | 18
-rw-r--r--  arch/x86/include/asm/intel-family.h | 33
-rw-r--r--  arch/x86/include/asm/irqflags.h | 4
-rw-r--r--  arch/x86/include/asm/kexec.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 41
-rw-r--r--  arch/x86/include/asm/microcode_intel.h | 15
-rw-r--r--  arch/x86/include/asm/msr-index.h | 57
-rw-r--r--  arch/x86/include/asm/mwait.h | 9
-rw-r--r--  arch/x86/include/asm/nospec-branch.h | 67
-rw-r--r--  arch/x86/include/asm/perf_event.h | 12
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 16
-rw-r--r--  arch/x86/include/asm/processor.h | 31
-rw-r--r--  arch/x86/include/asm/ptrace.h | 48
-rw-r--r--  arch/x86/include/asm/smp.h | 10
-rw-r--r--  arch/x86/include/asm/spec-ctrl.h | 20
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 5
-rw-r--r--  arch/x86/include/asm/suspend_32.h | 8
-rw-r--r--  arch/x86/include/asm/suspend_64.h | 19
-rw-r--r--  arch/x86/include/asm/switch_to.h | 4
-rw-r--r--  arch/x86/include/asm/thread_info.h | 20
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 8
-rw-r--r--  arch/x86/include/asm/uaccess.h | 4
-rw-r--r--  arch/x86/include/asm/vgtod.h | 7
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 3
-rw-r--r--  arch/x86/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/x86/include/uapi/asm/mce.h | 2
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 9
-rw-r--r--  arch/x86/kernel/apic/apic.c | 200
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 24
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 17
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 73
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 946
-rw-r--r--  arch/x86/kernel/cpu/common.c | 178
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 18
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 16
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 16
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 80
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 86
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 2
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 22
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 70
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.sh | 2
-rw-r--r--  arch/x86/kernel/cpu/tsx.c | 141
-rw-r--r--  arch/x86/kernel/hpet.c | 2
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 1
-rw-r--r--  arch/x86/kernel/irq_64.c | 19
-rw-r--r--  arch/x86/kernel/kgdb.c | 2
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 30
-rw-r--r--  arch/x86/kernel/nmi.c | 4
-rw-r--r--  arch/x86/kernel/process.c | 101
-rw-r--r--  arch/x86/kernel/process.h | 39
-rw-r--r--  arch/x86/kernel/process_32.c | 16
-rw-r--r--  arch/x86/kernel/process_64.c | 17
-rw-r--r--  arch/x86/kernel/ptrace.c | 4
-rw-r--r--  arch/x86/kernel/reboot.c | 21
-rw-r--r--  arch/x86/kernel/signal.c | 29
-rw-r--r--  arch/x86/kernel/smp.c | 46
-rw-r--r--  arch/x86/kernel/sysfb_efi.c | 46
-rw-r--r--  arch/x86/kernel/sysfb_simplefb.c | 2
-rw-r--r--  arch/x86/kernel/tls.c | 9
-rw-r--r--  arch/x86/kernel/tsc.c | 2
-rw-r--r--  arch/x86/kernel/unwind_frame.c | 20
-rw-r--r--  arch/x86/kernel/uprobes.c | 23
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 2
-rw-r--r--  arch/x86/kvm/cpuid.c | 34
-rw-r--r--  arch/x86/kvm/cpuid.h | 2
-rw-r--r--  arch/x86/kvm/emulate.c | 74
-rw-r--r--  arch/x86/kvm/hyperv.c | 11
-rw-r--r--  arch/x86/kvm/ioapic.c | 15
-rw-r--r--  arch/x86/kvm/irq_comm.c | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 20
-rw-r--r--  arch/x86/kvm/mmu.c | 460
-rw-r--r--  arch/x86/kvm/mmu.h | 21
-rw-r--r--  arch/x86/kvm/mmutrace.h | 59
-rw-r--r--  arch/x86/kvm/mtrr.c | 9
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 79
-rw-r--r--  arch/x86/kvm/pmu.c | 4
-rw-r--r--  arch/x86/kvm/pmu.h | 18
-rw-r--r--  arch/x86/kvm/pmu_intel.c | 37
-rw-r--r--  arch/x86/kvm/svm.c | 37
-rw-r--r--  arch/x86/kvm/trace.h | 4
-rw-r--r--  arch/x86/kvm/vmx.c | 246
-rw-r--r--  arch/x86/kvm/x86.c | 191
-rw-r--r--  arch/x86/lib/cpu.c | 1
-rw-r--r--  arch/x86/lib/delay.c | 4
-rw-r--r--  arch/x86/lib/x86-opcode-map.txt | 20
-rw-r--r--  arch/x86/math-emu/fpu_emu.h | 2
-rw-r--r--  arch/x86/math-emu/reg_constant.c | 2
-rw-r--r--  arch/x86/mm/fault.c | 43
-rw-r--r--  arch/x86/mm/gup.c | 41
-rw-r--r--  arch/x86/mm/init.c | 2
-rw-r--r--  arch/x86/mm/kaiser.c | 4
-rw-r--r--  arch/x86/mm/pgtable.c | 10
-rw-r--r--  arch/x86/mm/tlb.c | 114
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 2
-rw-r--r--  arch/x86/pci/fixup.c | 11
-rw-r--r--  arch/x86/pci/irq.c | 10
-rw-r--r--  arch/x86/platform/atom/punit_atom_debug.c | 4
-rw-r--r--  arch/x86/platform/efi/efi.c | 16
-rw-r--r--  arch/x86/power/cpu.c | 192
-rw-r--r--  arch/x86/power/hibernate_64.c | 33
-rw-r--r--  arch/x86/realmode/rm/Makefile | 2
-rw-r--r--  arch/x86/tools/gen-insn-attr-x86.awk | 4
-rw-r--r--  arch/x86/um/os-Linux/registers.c | 30
-rw-r--r--  arch/x86/um/user-offsets.c | 6
142 files changed, 4082 insertions, 1310 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e31001ec4c07..80636caee07c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -937,13 +937,7 @@ config NR_CPUS
approximately eight kilobytes to the kernel image.
config SCHED_SMT
- bool "SMT (Hyperthreading) scheduler support"
- depends on SMP
- ---help---
- SMT scheduler support improves the CPU scheduler's decision making
- when dealing with Intel Pentium 4 chips with HyperThreading at a
- cost of slightly increased overhead in some places. If unsure say
- N here.
+ def_bool y if SMP
config SCHED_MC
def_bool y
@@ -1761,6 +1755,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
If unsure, say y.
+choice
+ prompt "TSX enable mode"
+ depends on CPU_SUP_INTEL
+ default X86_INTEL_TSX_MODE_OFF
+ help
+ Intel's TSX (Transactional Synchronization Extensions) feature
+ allows to optimize locking protocols through lock elision which
+ can lead to a noticeable performance boost.
+
+ On the other hand it has been shown that TSX can be exploited
+ to form side channel attacks (e.g. TAA) and chances are there
+ will be more of those attacks discovered in the future.
+
+ Therefore TSX is not enabled by default (aka tsx=off). An admin
+	  might override this decision with the tsx=on command line parameter.
+ Even with TSX enabled, the kernel will attempt to enable the best
+ possible TAA mitigation setting depending on the microcode available
+ for the particular machine.
+
+ This option allows to set the default tsx mode between tsx=on, =off
+ and =auto. See Documentation/kernel-parameters.txt for more
+ details.
+
+ Say off if not sure, auto if TSX is in use but it should be used on safe
+ platforms or on if TSX is in use and the security aspect of tsx is not
+ relevant.
+
+config X86_INTEL_TSX_MODE_OFF
+ bool "off"
+ help
+ TSX is disabled if possible - equals to tsx=off command line parameter.
+
+config X86_INTEL_TSX_MODE_ON
+ bool "on"
+ help
+ TSX is always enabled on TSX capable HW - equals the tsx=on command
+ line parameter.
+
+config X86_INTEL_TSX_MODE_AUTO
+ bool "auto"
+ help
+ TSX is enabled on TSX capable HW that is believed to be safe against
+ side channel attacks- equals the tsx=auto command line parameter.
+endchoice
+
config EFI
bool "EFI runtime service support"
depends on ACPI
@@ -2051,14 +2090,8 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
If unsure, leave at the default value.
config HOTPLUG_CPU
- bool "Support for hot-pluggable CPUs"
+ def_bool y
depends on SMP
- ---help---
- Say Y here to allow turning CPUs off and on. CPUs can be
- controlled through /sys/devices/system/cpu.
- ( Note: power management support will enable this option
- automatically on SMP systems. )
- Say N if you want to disable CPU hotplug.
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
@@ -2581,8 +2614,7 @@ config OLPC
config OLPC_XO1_PM
bool "OLPC XO-1 Power Management"
- depends on OLPC && MFD_CS5535 && PM_SLEEP
- select MFD_CORE
+ depends on OLPC && MFD_CS5535=y && PM_SLEEP
---help---
Add support for poweroff and suspend of the OLPC XO-1 laptop.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 4386440fe463..f09a192260f8 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -192,7 +192,7 @@ config HAVE_MMIOTRACE_SUPPORT
config X86_DECODER_SELFTEST
bool "x86 instruction decoder selftest"
- depends on DEBUG_KERNEL && KPROBES
+ depends on DEBUG_KERNEL && INSTRUCTION_DECODER
depends on !COMPILE_TEST
---help---
Perform x86 instruction decoder selftests at build time.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b5226a009973..940ed27a6212 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -38,6 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
export REALMODE_CFLAGS
@@ -47,7 +48,7 @@ export REALMODE_CFLAGS
export BITS
ifdef CONFIG_X86_NEED_RELOCS
- LDFLAGS_vmlinux := --emit-relocs
+ LDFLAGS_vmlinux := --emit-relocs --discard-none
endif
#
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 3b7156f46bc1..3b16935b22bc 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -100,7 +100,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
AFLAGS_header.o += -I$(objtree)/$(obj)
$(obj)/header.o: $(obj)/zoffset.h
-LDFLAGS_setup.elf := -T
+LDFLAGS_setup.elf := -m elf_i386 -T
$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index fd0b6a272dd5..7532f6f53677 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -170,7 +170,7 @@ preferred_addr:
notl %eax
andl %eax, %ebx
cmpl $LOAD_PHYSICAL_ADDR, %ebx
- jge 1f
+ jae 1f
#endif
movl $LOAD_PHYSICAL_ADDR, %ebx
1:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index efdfba21a5b2..3fac2d133e4e 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -104,7 +104,7 @@ ENTRY(startup_32)
notl %eax
andl %eax, %ebx
cmpl $LOAD_PHYSICAL_ADDR, %ebx
- jge 1f
+ jae 1f
#endif
movl $LOAD_PHYSICAL_ADDR, %ebx
1:
@@ -227,6 +227,11 @@ ENTRY(efi32_stub_entry)
leal efi32_config(%ebp), %eax
movl %eax, efi_config(%ebp)
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
jmp startup_32
ENDPROC(efi32_stub_entry)
#endif
@@ -334,7 +339,7 @@ preferred_addr:
notq %rax
andq %rax, %rbp
cmpq $LOAD_PHYSICAL_ADDR, %rbp
- jge 1f
+ jae 1f
#endif
movq $LOAD_PHYSICAL_ADDR, %rbp
1:
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index d86e68d3c794..1912b2671f10 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -15,6 +15,7 @@
#include "error.h"
#include "../string.h"
#include "../voffset.h"
+#include <asm/bootparam_utils.h>
/*
* WARNING!!
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 2728e1b7e4a6..a8789aa647b4 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -19,7 +19,6 @@
#include <asm/page.h>
#include <asm/boot.h>
#include <asm/bootparam.h>
-#include <asm/bootparam_utils.h>
#define BOOT_BOOT_H
#include "../ctype.h"
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index cd4df9322501..7bbfe7d35da7 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -76,15 +76,14 @@ static int chksum_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
- u8 *out)
+static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
{
if (irq_fpu_usable()) {
kernel_fpu_begin();
- *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
+ *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
kernel_fpu_end();
} else
- *(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
+ *(__u16 *)out = crc_t10dif_generic(crc, data, len);
return 0;
}
@@ -93,15 +92,13 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data,
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
- return __chksum_finup(&ctx->crc, data, len, out);
+ return __chksum_finup(ctx->crc, data, len, out);
}
static int chksum_digest(struct shash_desc *desc, const u8 *data,
unsigned int length, u8 *out)
{
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- return __chksum_finup(&ctx->crc, data, length, out);
+ return __chksum_finup(0, data, length, out);
}
static struct shash_alg alg = {
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
index eff2f414e22b..ec234c43b3f4 100644
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -321,6 +321,12 @@ ENTRY(poly1305_4block_avx2)
vpaddq t2,t1,t1
vmovq t1x,d4
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
@@ -359,16 +365,16 @@ ENTRY(poly1305_4block_avx2)
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
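
Editor's note: the comment added in the hunk above describes the partial reduction mod (2^130)-5 over 26-bit limbs, and the register-width fixes (lea/add on %rax/%rbx instead of %eax/%ebx) exist because the carries can exceed 32 bits in the multi-block case. The following is a standalone C sketch of that carry chain, not part of the patch; the helper name is made up for illustration.

    #include <stdint.h>

    /*
     * Partial reduction mod (2^130)-5: d[0..4] hold 26-bit limbs in 64-bit
     * words, and carries are propagated d0 -> d1 -> d2 -> d3 -> d4 -> d0 -> d1,
     * exactly as the assembly comment describes.  Keeping the carries in
     * 64-bit variables is the point: they may not fit in 32 bits here.
     */
    static void poly1305_partial_reduce(uint64_t d[5])
    {
            uint64_t c;

            c = d[0] >> 26;  d[0] &= 0x3ffffff;  d[1] += c;
            c = d[1] >> 26;  d[1] &= 0x3ffffff;  d[2] += c;
            c = d[2] >> 26;  d[2] &= 0x3ffffff;  d[3] += c;
            c = d[3] >> 26;  d[3] &= 0x3ffffff;  d[4] += c;
            c = d[4] >> 26;  d[4] &= 0x3ffffff;  d[0] += c * 5; /* 2^130 == 5 (mod 2^130 - 5) */
            c = d[0] >> 26;  d[0] &= 0x3ffffff;  d[1] += c;
    }
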
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index 338c748054ed..639d9760b089 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -251,16 +251,16 @@ ENTRY(poly1305_block_sse2)
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
@@ -518,6 +518,12 @@ ENTRY(poly1305_2block_sse2)
paddq t2,t1
movq t1,d4
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
@@ -556,16 +562,16 @@ ENTRY(poly1305_2block_sse2)
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9a9e5884066c..8af8c070f213 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,4 +1,5 @@
#include <linux/jump_label.h>
+#include <asm/cpufeatures.h>
/*
@@ -201,6 +202,23 @@ For 32-bit we have the following conventions - kernel is built with
.byte 0xf1
.endm
+/*
+ * Mitigate Spectre v1 for conditional swapgs code paths.
+ *
+ * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to
+ * prevent a speculative swapgs when coming from kernel space.
+ *
+ * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path,
+ * to prevent the swapgs from getting speculatively skipped when coming from
+ * user space.
+ */
+.macro FENCE_SWAPGS_USER_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER
+.endm
+.macro FENCE_SWAPGS_KERNEL_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL
+.endm
+
#endif /* CONFIG_X86_64 */
/*
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index b0cd306dc527..8841d016b4a4 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -28,6 +28,7 @@
#include <asm/vdso.h>
#include <asm/uaccess.h>
#include <asm/cpufeature.h>
+#include <asm/nospec-branch.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -206,6 +207,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
#endif
user_enter_irqoff();
+
+ mds_user_clear_cpu_buffers();
}
#define SYSCALL_EXIT_WORK_FLAGS \
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a76dc738ec61..4d980d11e2d1 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -219,6 +219,7 @@ ENTRY(__switch_to_asm)
pushl %ebx
pushl %edi
pushl %esi
+ pushfl
/* switch stack */
movl %esp, TASK_threadsp(%eax)
@@ -241,6 +242,7 @@ ENTRY(__switch_to_asm)
#endif
/* restore callee-saved registers */
+ popfl
popl %esi
popl %edi
popl %ebx
@@ -1193,6 +1195,7 @@ ENTRY(int3)
END(int3)
ENTRY(general_protection)
+ ASM_CLAC
pushl $do_general_protection
jmp error_code
END(general_protection)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 870e941c1947..10ecfba43dff 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -313,6 +313,7 @@ ENTRY(__switch_to_asm)
pushq %r13
pushq %r14
pushq %r15
+ pushfq
/* switch stack */
movq %rsp, TASK_threadsp(%rdi)
@@ -335,6 +336,7 @@ ENTRY(__switch_to_asm)
#endif
/* restore callee-saved registers */
+ popfq
popq %r15
popq %r14
popq %r13
@@ -418,6 +420,7 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
SWITCH_KERNEL_CR3
/*
@@ -431,8 +434,10 @@ END(irq_entries_start)
TRACE_IRQS_OFF
CALL_enter_from_user_mode
-
+ jmp 2f
1:
+ FENCE_SWAPGS_KERNEL_ENTRY
+2:
/*
* Save previous stack pointer, optionally switch to interrupt stack.
* irq_count is used to check if a CPU is already on an interrupt stack
@@ -1002,6 +1007,13 @@ ENTRY(paranoid_entry)
movq %rax, %cr3
2:
#endif
+ /*
+ * The above doesn't do an unconditional CR3 write, even in the PTI
+ * case. So do an lfence to prevent GS speculation, regardless of
+ * whether PTI is enabled.
+ */
+ FENCE_SWAPGS_KERNEL_ENTRY
+
ret
END(paranoid_entry)
@@ -1063,6 +1075,7 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -1074,6 +1087,8 @@ ENTRY(error_entry)
CALL_enter_from_user_mode
ret
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
.Lerror_entry_done:
TRACE_IRQS_OFF
ret
@@ -1092,7 +1107,7 @@ ENTRY(error_entry)
cmpq %rax, RIP+8(%rsp)
je .Lbstep_iret
cmpq $.Lgs_change, RIP+8(%rsp)
- jne .Lerror_entry_done
+ jne .Lerror_entry_done_lfence
/*
* hack: .Lgs_change can fail with user gsbase. If this happens, fix up
@@ -1100,6 +1115,7 @@ ENTRY(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
jmp .Lerror_entry_done
.Lbstep_iret:
@@ -1113,6 +1129,7 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1209,6 +1226,7 @@ ENTRY(nmi)
* to switch CR3 here.
*/
cld
+ FENCE_SWAPGS_USER_ENTRY
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 5*8(%rdx) /* pt_regs->ss */
@@ -1497,6 +1515,7 @@ end_repeat_nmi:
movq %rax, %cr3
2:
#endif
+ FENCE_SWAPGS_KERNEL_ENTRY
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
call do_nmi
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index d5409660f5de..0d3ebdfa0739 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -47,10 +47,8 @@ targets += $(vdso_img_sodbg)
export CPPFLAGS_vdso.lds += -P -C
-VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
- -Wl,--no-undefined \
- -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
- $(DISABLE_LTO)
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
+ -z max-page-size=4096
$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
@@ -96,10 +94,8 @@ CFLAGS_REMOVE_vvar.o = -pg
#
CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
-VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
- -Wl,-soname=linux-vdso.so.1 \
- -Wl,-z,max-page-size=4096 \
- -Wl,-z,common-page-size=4096
+VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
+ -z max-page-size=4096
# 64-bit objects to re-brand as x32
vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y))
@@ -127,7 +123,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
-VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
+VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
# This makes sure the $(obj) subdirectory exists even though vdso32/
# is not a kbuild sub-make subdirectory.
@@ -165,13 +161,14 @@ $(obj)/vdso32.so.dbg: FORCE \
# The DSO images are built using a special linker script.
#
quiet_cmd_vdso = VDSO $@
- cmd_vdso = $(CC) -nostdlib -o $@ \
+ cmd_vdso = $(LD) -nostdlib -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
- -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
+ -T $(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
-VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \
- $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
+VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \
+ $(call ld-option, --build-id) $(call ld-option, --eh-frame-hdr) \
+ -Bsymbolic
GCOV_PROFILE := n
#
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 3f9d1a83891a..50c1f77cab15 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -10,6 +10,7 @@
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/mm_types.h>
+#include <linux/elf.h>
#include <asm/processor.h>
#include <asm/vdso.h>
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index afb222b63cae..836b7e4a2005 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -112,23 +112,145 @@ static __initconst const u64 amd_hw_cache_event_ids
},
};
+static __initconst const u64 amd_hw_cache_event_ids_f17h
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */
+ [C(RESULT_MISS)] = 0xc860, /* L2$ access from DC Miss */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches */
+ [C(RESULT_MISS)] = 0x0081, /* Instruction cache misses */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */
+ [C(RESULT_MISS)] = 0xf045, /* L2 DTLB misses (PT walks) */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */
+ [C(RESULT_MISS)] = 0xff85, /* L1 ITLB misses, L2 misses */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr. */
+ [C(RESULT_MISS)] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(NODE)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+};
+
/*
- * AMD Performance Monitor K7 and later.
+ * AMD Performance Monitor K7 and later, up to and including Family 16h:
*/
static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x077e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
- [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
- [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x077e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
+};
+
+/*
+ * AMD Performance Monitor Family 17h and later:
+ */
+static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287,
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187,
};
static u64 amd_pmu_event_map(int hw_event)
{
+ if (boot_cpu_data.x86 >= 0x17)
+ return amd_f17h_perfmon_event_map[hw_event];
+
return amd_perfmon_event_map[hw_event];
}
@@ -714,9 +836,10 @@ __init int amd_pmu_init(void)
x86_pmu.amd_nb_constraints = 0;
}
- /* Events are common for all AMDs */
- memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
+ if (boot_cpu_data.x86 >= 0x17)
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids));
+ else
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids));
return 0;
}
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index fd4484ae3ffc..5f72b473f3ed 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -388,7 +388,8 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
config &= ~perf_ibs->cnt_mask;
- wrmsrl(hwc->config_base, config);
+ if (boot_cpu_data.x86 == 0x10)
+ wrmsrl(hwc->config_base, config);
config &= ~perf_ibs->enable_mask;
wrmsrl(hwc->config_base, config);
}
@@ -563,7 +564,8 @@ static struct perf_ibs perf_ibs_op = {
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
- .cnt_mask = IBS_OP_MAX_CNT,
+ .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
+ IBS_OP_CUR_CNT_RAND,
.enable_mask = IBS_OP_ENABLE,
.valid_mask = IBS_OP_VAL,
.max_period = IBS_OP_MAX_CNT << 4,
@@ -624,7 +626,7 @@ fail:
if (event->attr.sample_type & PERF_SAMPLE_RAW)
offset_max = perf_ibs->offset_max;
else if (check_rip)
- offset_max = 2;
+ offset_max = 3;
else
offset_max = 1;
do {
@@ -671,10 +673,17 @@ fail:
throttle = perf_event_overflow(event, &data, &regs);
out:
- if (throttle)
+ if (throttle) {
perf_ibs_stop(event, 0);
- else
- perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+ } else {
+ period >>= 4;
+
+ if ((ibs_caps & IBS_CAPS_RDWROPCNT) &&
+ (*config & IBS_OP_CNT_CTL))
+ period |= *config & IBS_OP_CUR_CNT_RAND;
+
+ perf_ibs_enable_event(perf_ibs, hwc, period);
+ }
perf_event_update_userpage(event);
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 65577f081d07..6bfb9a68134c 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -19,13 +19,14 @@
#include <asm/cpufeature.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
+#include <asm/smp.h>
#define NUM_COUNTERS_NB 4
#define NUM_COUNTERS_L2 4
#define MAX_COUNTERS NUM_COUNTERS_NB
#define RDPMC_BASE_NB 6
-#define RDPMC_BASE_L2 10
+#define RDPMC_BASE_LLC 10
#define COUNTER_SHIFT 16
@@ -45,30 +46,30 @@ struct amd_uncore {
};
static struct amd_uncore * __percpu *amd_uncore_nb;
-static struct amd_uncore * __percpu *amd_uncore_l2;
+static struct amd_uncore * __percpu *amd_uncore_llc;
static struct pmu amd_nb_pmu;
-static struct pmu amd_l2_pmu;
+static struct pmu amd_llc_pmu;
static cpumask_t amd_nb_active_mask;
-static cpumask_t amd_l2_active_mask;
+static cpumask_t amd_llc_active_mask;
static bool is_nb_event(struct perf_event *event)
{
return event->pmu->type == amd_nb_pmu.type;
}
-static bool is_l2_event(struct perf_event *event)
+static bool is_llc_event(struct perf_event *event)
{
- return event->pmu->type == amd_l2_pmu.type;
+ return event->pmu->type == amd_llc_pmu.type;
}
static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
{
if (is_nb_event(event) && amd_uncore_nb)
return *per_cpu_ptr(amd_uncore_nb, event->cpu);
- else if (is_l2_event(event) && amd_uncore_l2)
- return *per_cpu_ptr(amd_uncore_l2, event->cpu);
+ else if (is_llc_event(event) && amd_uncore_llc)
+ return *per_cpu_ptr(amd_uncore_llc, event->cpu);
return NULL;
}
@@ -183,21 +184,19 @@ static int amd_uncore_event_init(struct perf_event *event)
return -ENOENT;
/*
- * NB and L2 counters (MSRs) are shared across all cores that share the
- * same NB / L2 cache. Interrupts can be directed to a single target
- * core, however, event counts generated by processes running on other
- * cores cannot be masked out. So we do not support sampling and
- * per-thread events.
+ * NB and Last level cache counters (MSRs) are shared across all cores
+ * that share the same NB / Last level cache. On family 16h and below,
+ * Interrupts can be directed to a single target core, however, event
+ * counts generated by processes running on other cores cannot be masked
+ * out. So we do not support sampling and per-thread events via
+ * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts:
*/
- if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
- return -EINVAL;
- /* NB and L2 counters do not have usr/os/guest/host bits */
+ /* NB and Last level cache counters do not have usr/os/guest/host bits */
if (event->attr.exclude_user || event->attr.exclude_kernel ||
event->attr.exclude_host || event->attr.exclude_guest)
return -EINVAL;
- /* and we do not enable counter overflow interrupts */
hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
hwc->idx = -1;
@@ -226,8 +225,8 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
if (pmu->type == amd_nb_pmu.type)
active_mask = &amd_nb_active_mask;
- else if (pmu->type == amd_l2_pmu.type)
- active_mask = &amd_l2_active_mask;
+ else if (pmu->type == amd_llc_pmu.type)
+ active_mask = &amd_llc_active_mask;
else
return 0;
@@ -274,9 +273,10 @@ static struct pmu amd_nb_pmu = {
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
};
-static struct pmu amd_l2_pmu = {
+static struct pmu amd_llc_pmu = {
.task_ctx_nr = perf_invalid_context,
.attr_groups = amd_uncore_attr_groups,
.name = "amd_l2",
@@ -286,6 +286,7 @@ static struct pmu amd_l2_pmu = {
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
};
static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
@@ -296,7 +297,7 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
static int amd_uncore_cpu_up_prepare(unsigned int cpu)
{
- struct amd_uncore *uncore_nb = NULL, *uncore_l2;
+ struct amd_uncore *uncore_nb = NULL, *uncore_llc;
if (amd_uncore_nb) {
uncore_nb = amd_uncore_alloc(cpu);
@@ -312,18 +313,18 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
- if (amd_uncore_l2) {
- uncore_l2 = amd_uncore_alloc(cpu);
- if (!uncore_l2)
+ if (amd_uncore_llc) {
+ uncore_llc = amd_uncore_alloc(cpu);
+ if (!uncore_llc)
goto fail;
- uncore_l2->cpu = cpu;
- uncore_l2->num_counters = NUM_COUNTERS_L2;
- uncore_l2->rdpmc_base = RDPMC_BASE_L2;
- uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
- uncore_l2->active_mask = &amd_l2_active_mask;
- uncore_l2->pmu = &amd_l2_pmu;
- uncore_l2->id = -1;
- *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
+ uncore_llc->cpu = cpu;
+ uncore_llc->num_counters = NUM_COUNTERS_L2;
+ uncore_llc->rdpmc_base = RDPMC_BASE_LLC;
+ uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL;
+ uncore_llc->active_mask = &amd_llc_active_mask;
+ uncore_llc->pmu = &amd_llc_pmu;
+ uncore_llc->id = -1;
+ *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc;
}
return 0;
@@ -376,17 +377,12 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
}
- if (amd_uncore_l2) {
- unsigned int apicid = cpu_data(cpu).apicid;
- unsigned int nshared;
-
- uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
- cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
- nshared = ((eax >> 14) & 0xfff) + 1;
- uncore->id = apicid - (apicid % nshared);
+ if (amd_uncore_llc) {
+ uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
+ uncore->id = per_cpu(cpu_llc_id, cpu);
- uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
- *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+ uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
+ *per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
}
return 0;
@@ -419,8 +415,8 @@ static int amd_uncore_cpu_online(unsigned int cpu)
if (amd_uncore_nb)
uncore_online(cpu, amd_uncore_nb);
- if (amd_uncore_l2)
- uncore_online(cpu, amd_uncore_l2);
+ if (amd_uncore_llc)
+ uncore_online(cpu, amd_uncore_llc);
return 0;
}
@@ -456,8 +452,8 @@ static int amd_uncore_cpu_down_prepare(unsigned int cpu)
if (amd_uncore_nb)
uncore_down_prepare(cpu, amd_uncore_nb);
- if (amd_uncore_l2)
- uncore_down_prepare(cpu, amd_uncore_l2);
+ if (amd_uncore_llc)
+ uncore_down_prepare(cpu, amd_uncore_llc);
return 0;
}
@@ -479,8 +475,8 @@ static int amd_uncore_cpu_dead(unsigned int cpu)
if (amd_uncore_nb)
uncore_dead(cpu, amd_uncore_nb);
- if (amd_uncore_l2)
- uncore_dead(cpu, amd_uncore_l2);
+ if (amd_uncore_llc)
+ uncore_dead(cpu, amd_uncore_llc);
return 0;
}
@@ -510,16 +506,16 @@ static int __init amd_uncore_init(void)
}
if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
- amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
- if (!amd_uncore_l2) {
+ amd_uncore_llc = alloc_percpu(struct amd_uncore *);
+ if (!amd_uncore_llc) {
ret = -ENOMEM;
- goto fail_l2;
+ goto fail_llc;
}
- ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1);
if (ret)
- goto fail_l2;
+ goto fail_llc;
- pr_info("perf: AMD L2I counters detected\n");
+ pr_info("perf: AMD LLC counters detected\n");
ret = 0;
}
@@ -529,7 +525,7 @@ static int __init amd_uncore_init(void)
if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP,
"PERF_X86_AMD_UNCORE_PREP",
amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead))
- goto fail_l2;
+ goto fail_llc;
if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
"AP_PERF_X86_AMD_UNCORE_STARTING",
@@ -546,11 +542,11 @@ fail_start:
cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
fail_prep:
cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
-fail_l2:
+fail_llc:
if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
perf_pmu_unregister(&amd_nb_pmu);
- if (amd_uncore_l2)
- free_percpu(amd_uncore_l2);
+ if (amd_uncore_llc)
+ free_percpu(amd_uncore_llc);
fail_nb:
if (amd_uncore_nb)
free_percpu(amd_uncore_nb);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1e9f610d36a4..c26cca506f64 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -374,7 +374,7 @@ int x86_add_exclusive(unsigned int what)
* LBR and BTS are still mutually exclusive.
*/
if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
- return 0;
+ goto out;
if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
mutex_lock(&pmc_reserve_mutex);
@@ -386,6 +386,7 @@ int x86_add_exclusive(unsigned int what)
mutex_unlock(&pmc_reserve_mutex);
}
+out:
atomic_inc(&active_events);
return 0;
@@ -396,11 +397,15 @@ fail_unlock:
void x86_del_exclusive(unsigned int what)
{
+ atomic_dec(&active_events);
+
+ /*
+ * See the comment in x86_add_exclusive().
+ */
if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
return;
atomic_dec(&x86_pmu.lbr_exclusive[what]);
- atomic_dec(&active_events);
}
int x86_setup_perfctr(struct perf_event *event)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 098ab775135f..55e362f9dbfa 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2867,7 +2867,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
return ret;
if (event->attr.precise_ip) {
- if (!event->attr.freq) {
+ if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
if (!(event->attr.sample_type &
~intel_pmu_free_running_flags(event)))
@@ -3075,6 +3075,11 @@ static u64 bdw_limit_period(struct perf_event *event, u64 left)
return left;
}
+static u64 nhm_limit_period(struct perf_event *event, u64 left)
+{
+ return max(left, 32ULL);
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -3734,6 +3739,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.extra_regs = intel_nehalem_extra_regs;
+ x86_pmu.limit_period = nhm_limit_period;
x86_pmu.cpu_events = nhm_events_attrs;
@@ -3750,11 +3756,11 @@ __init int intel_pmu_init(void)
pr_cont("Nehalem events, ");
break;
- case INTEL_FAM6_ATOM_PINEVIEW:
- case INTEL_FAM6_ATOM_LINCROFT:
- case INTEL_FAM6_ATOM_PENWELL:
- case INTEL_FAM6_ATOM_CLOVERVIEW:
- case INTEL_FAM6_ATOM_CEDARVIEW:
+ case INTEL_FAM6_ATOM_BONNELL:
+ case INTEL_FAM6_ATOM_BONNELL_MID:
+ case INTEL_FAM6_ATOM_SALTWELL:
+ case INTEL_FAM6_ATOM_SALTWELL_MID:
+ case INTEL_FAM6_ATOM_SALTWELL_TABLET:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -3766,9 +3772,11 @@ __init int intel_pmu_init(void)
pr_cont("Atom events, ");
break;
- case INTEL_FAM6_ATOM_SILVERMONT1:
- case INTEL_FAM6_ATOM_SILVERMONT2:
+ case INTEL_FAM6_ATOM_SILVERMONT:
+ case INTEL_FAM6_ATOM_SILVERMONT_X:
+ case INTEL_FAM6_ATOM_SILVERMONT_MID:
case INTEL_FAM6_ATOM_AIRMONT:
+ case INTEL_FAM6_ATOM_AIRMONT_MID:
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -3785,7 +3793,7 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_DENVERTON:
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 47d526c700a1..72d09340c24d 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -531,8 +531,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates),
- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates),
- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates),
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index f26e26e4d84f..f562ddbeb20c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -655,7 +655,7 @@ struct event_constraint intel_core2_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
EVENT_CONSTRAINT_END
};
@@ -664,7 +664,7 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
@@ -672,7 +672,7 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
struct event_constraint intel_slm_pebs_event_constraints[] = {
/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x1),
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
@@ -697,7 +697,7 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
EVENT_CONSTRAINT_END
};
@@ -714,7 +714,7 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
EVENT_CONSTRAINT_END
};
@@ -723,7 +723,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
@@ -738,9 +738,9 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
/* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
@@ -754,9 +754,9 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
/* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
@@ -777,9 +777,9 @@ struct event_constraint intel_bdw_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
/* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
@@ -800,9 +800,9 @@ struct event_constraint intel_bdw_pebs_event_constraints[] = {
struct event_constraint intel_skl_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
/* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
/* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
@@ -1326,6 +1326,8 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
old = ((s64)(prev_raw_count << shift) >> shift);
local64_add(new - old + count * period, &event->count);
+ local64_set(&hwc->period_left, -new);
+
perf_event_update_userpage(event);
return 0;
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index be0b1968d60a..68144a341903 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -61,8 +61,8 @@ static bool test_intel(int idx)
case INTEL_FAM6_BROADWELL_GT3E:
case INTEL_FAM6_BROADWELL_X:
- case INTEL_FAM6_ATOM_SILVERMONT1:
- case INTEL_FAM6_ATOM_SILVERMONT2:
+ case INTEL_FAM6_ATOM_SILVERMONT:
+ case INTEL_FAM6_ATOM_SILVERMONT_X:
case INTEL_FAM6_ATOM_AIRMONT:
if (idx == PERF_MSR_SMI)
return true;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cb13c0564ea7..9978ea4382bf 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -60,9 +60,8 @@
} while (0)
#define RELOAD_SEG(seg) { \
- unsigned int pre = GET_SEG(seg); \
+ unsigned int pre = (seg) | 3; \
unsigned int cur = get_user_seg(seg); \
- pre |= 3; \
if (pre != cur) \
set_user_seg(seg, pre); \
}
@@ -71,6 +70,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
struct sigcontext_32 __user *sc)
{
unsigned int tmpflags, err = 0;
+ u16 gs, fs, es, ds;
void __user *buf;
u32 tmp;
@@ -78,16 +78,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
current->restart_block.fn = do_no_restart_syscall;
get_user_try {
- /*
- * Reload fs and gs if they have changed in the signal
- * handler. This does not handle long fs/gs base changes in
- * the handler, but does not clobber them at least in the
- * normal case.
- */
- RELOAD_SEG(gs);
- RELOAD_SEG(fs);
- RELOAD_SEG(ds);
- RELOAD_SEG(es);
+ gs = GET_SEG(gs);
+ fs = GET_SEG(fs);
+ ds = GET_SEG(ds);
+ es = GET_SEG(es);
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip); COPY(ax);
@@ -105,6 +99,17 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
buf = compat_ptr(tmp);
} get_user_catch(err);
+ /*
+ * Reload fs and gs if they have changed in the signal
+ * handler. This does not handle long fs/gs base changes in
+ * the handler, but does not clobber them at least in the
+ * normal case.
+ */
+ RELOAD_SEG(gs);
+ RELOAD_SEG(fs);
+ RELOAD_SEG(ds);
+ RELOAD_SEG(es);
+
err |= fpu__restore_sig(buf, 1);
force_iret();
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2188b5af8167..f39fd349cef6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -50,7 +50,7 @@ static inline void generic_apic_probe(void)
#ifdef CONFIG_X86_LOCAL_APIC
-extern unsigned int apic_verbosity;
+extern int apic_verbosity;
extern int local_apic_timer_c2_ok;
extern int disable_apic;
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 14635c5ea025..305c6eed9141 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -49,7 +49,7 @@ static __always_inline void atomic_add(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "addl %1,%0"
: "+m" (v->counter)
- : "ir" (i));
+ : "ir" (i) : "memory");
}
/**
@@ -63,7 +63,7 @@ static __always_inline void atomic_sub(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "subl %1,%0"
: "+m" (v->counter)
- : "ir" (i));
+ : "ir" (i) : "memory");
}
/**
@@ -89,7 +89,7 @@ static __always_inline bool atomic_sub_and_test(int i, atomic_t *v)
static __always_inline void atomic_inc(atomic_t *v)
{
asm volatile(LOCK_PREFIX "incl %0"
- : "+m" (v->counter));
+ : "+m" (v->counter) :: "memory");
}
/**
@@ -101,7 +101,7 @@ static __always_inline void atomic_inc(atomic_t *v)
static __always_inline void atomic_dec(atomic_t *v)
{
asm volatile(LOCK_PREFIX "decl %0"
- : "+m" (v->counter));
+ : "+m" (v->counter) :: "memory");
}
/**
@@ -249,19 +249,6 @@ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
return c;
}
-/**
- * atomic_inc_short - increment of a short integer
- * @v: pointer to type int
- *
- * Atomically adds 1 to @v
- * Returns the new value of @u
- */
-static __always_inline short int atomic_inc_short(short int *v)
-{
- asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
- return *v;
-}
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
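
Editor's note: the hunks above add "memory" clobbers so that the LOCK-prefixed asm statements also act as compiler barriers. A minimal C sketch of the reordering this prevents follows; it is not from the patch, and the function and variable names are hypothetical. It assumes the kernel's <linux/atomic.h>.

    #include <linux/atomic.h>

    static int ready;
    static atomic_t nr_published = ATOMIC_INIT(0);

    static void publish_one(void)
    {
            ready = 1;                      /* plain store */
            atomic_inc(&nr_published);      /* with the "memory" clobber the asm is
                                             * also a compiler barrier, so the plain
                                             * store above cannot be moved past it
                                             * at compile time */
    }
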
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 89ed2f6ae2f7..a3248402c36b 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -44,7 +44,7 @@ static __always_inline void atomic64_add(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "addq %1,%0"
: "=m" (v->counter)
- : "er" (i), "m" (v->counter));
+ : "er" (i), "m" (v->counter) : "memory");
}
/**
@@ -58,7 +58,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "subq %1,%0"
: "=m" (v->counter)
- : "er" (i), "m" (v->counter));
+ : "er" (i), "m" (v->counter) : "memory");
}
/**
@@ -85,7 +85,7 @@ static __always_inline void atomic64_inc(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "incq %0"
: "=m" (v->counter)
- : "m" (v->counter));
+ : "m" (v->counter) : "memory");
}
/**
@@ -98,7 +98,7 @@ static __always_inline void atomic64_dec(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "decq %0"
: "=m" (v->counter)
- : "m" (v->counter));
+ : "m" (v->counter) : "memory");
}
/**
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index eb53c2c78a1f..a0f450b21d67 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -105,8 +105,8 @@ do { \
#endif
/* Atomic operations are already serializing on x86 */
-#define __smp_mb__before_atomic() barrier()
-#define __smp_mb__after_atomic() barrier()
+#define __smp_mb__before_atomic() do { } while (0)
+#define __smp_mb__after_atomic() do { } while (0)
#include <asm-generic/barrier.h>
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index 4a8cb8d7cbd5..588d8fbd1e6d 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -17,6 +17,20 @@
* Note: efi_info is commonly left uninitialized, but that field has a
* private magic, so it is better to leave it unchanged.
*/
+
+#define sizeof_mbr(type, member) ({ sizeof(((type *)0)->member); })
+
+#define BOOT_PARAM_PRESERVE(struct_member) \
+ { \
+ .start = offsetof(struct boot_params, struct_member), \
+ .len = sizeof_mbr(struct boot_params, struct_member), \
+ }
+
+struct boot_params_to_save {
+ unsigned int start;
+ unsigned int len;
+};
+
static void sanitize_boot_params(struct boot_params *boot_params)
{
/*
@@ -35,19 +49,40 @@ static void sanitize_boot_params(struct boot_params *boot_params)
*/
if (boot_params->sentinel) {
/* fields in boot_params are left uninitialized, clear them */
- memset(&boot_params->ext_ramdisk_image, 0,
- (char *)&boot_params->efi_info -
- (char *)&boot_params->ext_ramdisk_image);
- memset(&boot_params->kbd_status, 0,
- (char *)&boot_params->hdr -
- (char *)&boot_params->kbd_status);
- memset(&boot_params->_pad7[0], 0,
- (char *)&boot_params->edd_mbr_sig_buffer[0] -
- (char *)&boot_params->_pad7[0]);
- memset(&boot_params->_pad8[0], 0,
- (char *)&boot_params->eddbuf[0] -
- (char *)&boot_params->_pad8[0]);
- memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9));
+ static struct boot_params scratch;
+ char *bp_base = (char *)boot_params;
+ char *save_base = (char *)&scratch;
+ int i;
+
+ const struct boot_params_to_save to_save[] = {
+ BOOT_PARAM_PRESERVE(screen_info),
+ BOOT_PARAM_PRESERVE(apm_bios_info),
+ BOOT_PARAM_PRESERVE(tboot_addr),
+ BOOT_PARAM_PRESERVE(ist_info),
+ BOOT_PARAM_PRESERVE(hd0_info),
+ BOOT_PARAM_PRESERVE(hd1_info),
+ BOOT_PARAM_PRESERVE(sys_desc_table),
+ BOOT_PARAM_PRESERVE(olpc_ofw_header),
+ BOOT_PARAM_PRESERVE(efi_info),
+ BOOT_PARAM_PRESERVE(alt_mem_k),
+ BOOT_PARAM_PRESERVE(scratch),
+ BOOT_PARAM_PRESERVE(e820_entries),
+ BOOT_PARAM_PRESERVE(eddbuf_entries),
+ BOOT_PARAM_PRESERVE(edd_mbr_sig_buf_entries),
+ BOOT_PARAM_PRESERVE(edd_mbr_sig_buffer),
+ BOOT_PARAM_PRESERVE(hdr),
+ BOOT_PARAM_PRESERVE(e820_map),
+ BOOT_PARAM_PRESERVE(eddbuf),
+ };
+
+ memset(&scratch, 0, sizeof(scratch));
+
+ for (i = 0; i < ARRAY_SIZE(to_save); i++) {
+ memcpy(save_base + to_save[i].start,
+ bp_base + to_save[i].start, to_save[i].len);
+ }
+
+ memcpy(boot_params, save_base, sizeof(*boot_params));
}
}
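
Editor's note: the rewritten sanitize_boot_params() above replaces hard-coded memset() ranges with an explicit preserve-list built from offsetof()/sizeof(). Below is a minimal, self-contained C sketch of the same technique; the struct and field names are invented for illustration and do not appear in the patch.

    #include <stddef.h>
    #include <string.h>

    struct example_params {
            int     keep_me;
            int     stale_junk;
            char    also_keep[8];
    };

    struct field_span { size_t start, len; };

    #define PRESERVE(member) {                                              \
            .start = offsetof(struct example_params, member),               \
            .len   = sizeof(((struct example_params *)0)->member),          \
    }

    /* Copy listed fields into a zeroed scratch copy, then copy it back,
     * so every field not in the list ends up zeroed. */
    static void sanitize_example(struct example_params *p)
    {
            static struct example_params scratch;
            const struct field_span keep[] = { PRESERVE(keep_me), PRESERVE(also_keep) };
            size_t i;

            memset(&scratch, 0, sizeof(scratch));
            for (i = 0; i < sizeof(keep) / sizeof(keep[0]); i++)
                    memcpy((char *)&scratch + keep[i].start,
                           (char *)p + keep[i].start, keep[i].len);

            memcpy(p, &scratch, sizeof(*p));
    }
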
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 98444b77fbe3..fb457ba8ccc6 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -192,7 +192,8 @@
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-
+#define X86_FEATURE_FENCE_SWAPGS_USER ( 7*32+10) /* "" LFENCE in user entry SWAPGS path */
+#define X86_FEATURE_FENCE_SWAPGS_KERNEL ( 7*32+11) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
@@ -201,9 +202,6 @@
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
-#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
-
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
@@ -214,6 +212,7 @@
#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
+#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -271,10 +270,12 @@
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
-#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
-#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
+#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
@@ -304,6 +305,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */
/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
@@ -315,6 +317,7 @@
#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
+#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
@@ -352,5 +355,10 @@
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
+#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
+#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSBDS variant of BUG_MDS */
+#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
+#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort (TAA) */
+#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
#endif /* _ASM_X86_CPUFEATURES_H */
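Every flag in this header is encoded as word * 32 + bit, and the cpu_has()/boot_cpu_has() machinery recovers the word index and bit position from that single integer. A small, hypothetical illustration of the decoding (the two feature values are copied from the hunk above; the helper names are not kernel APIs):

    #include <stdio.h>

    #define X86_FEATURE_MD_CLEAR  (18*32 + 10)
    #define X86_FEATURE_AMD_SSBD  (13*32 + 24)

    static unsigned int feature_word(unsigned int f) { return f / 32; }
    static unsigned int feature_bit(unsigned int f)  { return f % 32; }

    int main(void)
    {
        printf("MD_CLEAR: word %u, bit %u\n",
               feature_word(X86_FEATURE_MD_CLEAR), feature_bit(X86_FEATURE_MD_CLEAR));
        printf("AMD_SSBD: word %u, bit %u\n",
               feature_word(X86_FEATURE_AMD_SSBD), feature_bit(X86_FEATURE_AMD_SSBD));
        return 0;
    }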
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
index f498411f2500..1b15304dd098 100644
--- a/arch/x86/include/asm/crash.h
+++ b/arch/x86/include/asm/crash.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_CRASH_H
#define _ASM_X86_CRASH_H
+struct kimage;
+
int crash_load_segments(struct kimage *image);
int crash_copy_backup_region(struct kimage *image);
int crash_setup_memmap_entries(struct kimage *image,
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 8554f960e21b..61d6f2c05757 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -142,7 +142,7 @@ extern pte_t *kmap_pte;
extern pte_t *pkmap_page_table;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
-void native_set_fixmap(enum fixed_addresses idx,
+void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags);
#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index b3e32b010ab1..c2c01f84df75 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -208,4 +208,22 @@ static inline int insn_offset_immediate(struct insn *insn)
return insn_offset_displacement(insn) + insn->displacement.nbytes;
}
+#define POP_SS_OPCODE 0x1f
+#define MOV_SREG_OPCODE 0x8e
+
+/*
+ * Intel SDM Vol.3A 6.8.3 states:
+ * "Any single-step trap that would be delivered following the MOV to SS
+ * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
+ * suppressed."
+ * This function returns true if @insn is MOV SS or POP SS. On these
+ * instructions, single stepping is suppressed.
+ */
+static inline int insn_masking_exception(struct insn *insn)
+{
+ return insn->opcode.bytes[0] == POP_SS_OPCODE ||
+ (insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 2);
+}
+
#endif /* _ASM_X86_INSN_H */
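insn_masking_exception() keys off the ModRM reg field: for opcode 0x8e (MOV to segment register) a reg value of 2 selects SS. A stand-alone sketch of that field extraction, with the kernel's X86_MODRM_REG() written out locally since insn.h isn't available in user space:

    #include <stdio.h>

    /* ModRM byte layout: mod[7:6] reg[5:3] rm[2:0] */
    #define MODRM_REG(m) (((m) >> 3) & 0x7)

    int main(void)
    {
        /* 8e d0 encodes "mov %eax,%ss": opcode 0x8e, ModRM 0xd0, reg field = 2 (SS) */
        unsigned char insn[2] = { 0x8e, 0xd0 };

        if (insn[0] == 0x8e && MODRM_REG(insn[1]) == 2)
            printf("MOV to SS: the following single-step trap is suppressed\n");
        return 0;
    }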
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 75b748a1deb8..74ee597beb3e 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -5,7 +5,7 @@
* "Big Core" Processors (Branded as Core, Xeon, etc...)
*
* The "_X" parts are generally the EP and EX Xeons, or the
- * "Extreme" ones, like Broadwell-E.
+ * "Extreme" ones, like Broadwell-E, or Atom microserver.
*
* Things ending in "2" are usually because we have no better
* name for them. There's no processor called "SILVERMONT2".
@@ -50,19 +50,24 @@
/* "Small Core" Processors (Atom) */
-#define INTEL_FAM6_ATOM_PINEVIEW 0x1C
-#define INTEL_FAM6_ATOM_LINCROFT 0x26
-#define INTEL_FAM6_ATOM_PENWELL 0x27
-#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35
-#define INTEL_FAM6_ATOM_CEDARVIEW 0x36
-#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */
-#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
-#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
-#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
-#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */
-#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
-#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
-#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A
+#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
+#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
+
+#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
+#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
+#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
+
+#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
+#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */
+#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */
+
+#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
+#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
+
+#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
+#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
+#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */
/* Xeon Phi */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 508a062e6cf1..0c8f4281b151 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -5,6 +5,8 @@
#ifndef __ASSEMBLY__
+#include <asm/nospec-branch.h>
+
/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
#define __cpuidle __attribute__((__section__(".cpuidle.text")))
@@ -53,11 +55,13 @@ static inline void native_irq_enable(void)
static inline __cpuidle void native_safe_halt(void)
{
+ mds_idle_clear_cpu_buffers();
asm volatile("sti; hlt": : :"memory");
}
static inline __cpuidle void native_halt(void)
{
+ mds_idle_clear_cpu_buffers();
asm volatile("hlt": : :"memory");
}
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 282630e4c6ea..1624a7ffa95d 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -66,7 +66,7 @@ struct kimage;
/* Memory to backup during crash kdump */
#define KEXEC_BACKUP_SRC_START (0UL)
-#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
+#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
/*
* CPU does not save ss and sp on stack if execution is already
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9a8167b175d5..d2c14a96ec28 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -261,6 +261,7 @@ struct kvm_rmap_head {
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
+ struct list_head lpage_disallowed_link;
/*
* The following two entries are used to key the shadow page in the
@@ -273,6 +274,7 @@ struct kvm_mmu_page {
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
bool unsync;
+ bool lpage_disallowed; /* Can't be replaced by an equiv large page */
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
@@ -487,6 +489,7 @@ struct kvm_vcpu_arch {
bool tpr_access_reporting;
u64 ia32_xss;
u64 microcode_version;
+ u64 arch_capabilities;
/*
* Paging state of the vcpu
@@ -723,6 +726,7 @@ struct kvm_arch {
*/
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
+ struct list_head lpage_disallowed_mmu_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
@@ -797,6 +801,8 @@ struct kvm_arch {
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
+
+ struct task_struct *nx_lpage_recovery_thread;
};
struct kvm_vm_stat {
@@ -810,6 +816,7 @@ struct kvm_vm_stat {
ulong mmu_unsync;
ulong remote_tlb_flush;
ulong lpages;
+ ulong nx_lpage_splits;
};
struct kvm_vcpu_stat {
@@ -1308,25 +1315,29 @@ enum {
#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+asmlinkage void __noreturn kvm_spurious_fault(void);
+
/*
* Hardware virtualization extension instructions may fault if a
* reboot turns off virtualization while processes are running.
- * Trap the fault and ignore the instruction if that happens.
+ * Usually after catching the fault we just panic; during reboot
+ * instead the instruction is ignored.
*/
-asmlinkage void kvm_spurious_fault(void);
-
-#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
- "666: " insn "\n\t" \
- "668: \n\t" \
- ".pushsection .fixup, \"ax\" \n" \
- "667: \n\t" \
- cleanup_insn "\n\t" \
- "cmpb $0, kvm_rebooting \n\t" \
- "jne 668b \n\t" \
- __ASM_SIZE(push) " $666b \n\t" \
- "jmp kvm_spurious_fault \n\t" \
- ".popsection \n\t" \
- _ASM_EXTABLE(666b, 667b)
+#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
+ "666: \n\t" \
+ insn "\n\t" \
+ "jmp 668f \n\t" \
+ "667: \n\t" \
+ "call kvm_spurious_fault \n\t" \
+ "668: \n\t" \
+ ".pushsection .fixup, \"ax\" \n\t" \
+ "700: \n\t" \
+ cleanup_insn "\n\t" \
+ "cmpb $0, kvm_rebooting\n\t" \
+ "je 667b \n\t" \
+ "jmp 668b \n\t" \
+ ".popsection \n\t" \
+ _ASM_EXTABLE(666b, 700b)
#define __kvm_handle_fault_on_reboot(insn) \
____kvm_handle_fault_on_reboot(insn, "")
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 5e69154c9f07..c8e472e2c896 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -52,6 +52,21 @@ struct extended_sigtable {
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+static inline u32 intel_get_microcode_revision(void)
+{
+ u32 rev, dummy;
+
+ native_wrmsrl(MSR_IA32_UCODE_REV, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ native_cpuid_eax(1);
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
+
+ return rev;
+}
+
extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
extern int microcode_sanity_check(void *mc, int print_err);
extern int find_matching_signature(void *mc, unsigned int csig, int cpf);
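intel_get_microcode_revision() writes 0 to MSR_IA32_UCODE_REV, executes CPUID(1) as the SDM requires, and then reads the revision back from the MSR's high word. Outside the kernel the loaded revision can usually be read through the msr driver; a hedged sketch, assuming /dev/cpu/0/msr exists (CONFIG_X86_MSR / the msr module) and the process has the required privileges:

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    #define MSR_IA32_UCODE_REV 0x8b

    int main(void)
    {
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0 || pread(fd, &val, sizeof(val), MSR_IA32_UCODE_REV) != sizeof(val)) {
            perror("msr read");
            return 1;
        }
        close(fd);
        /* The revision is the high 32 bits, matching the rev = EDX read in the hunk above */
        printf("microcode revision: 0x%x\n", (unsigned int)(val >> 32));
        return 0;
    }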
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 9963e21ac443..8d162e0f2881 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_MSR_INDEX_H
#define _ASM_X86_MSR_INDEX_H
+#include <linux/bits.h>
+
/*
* CPU model specific register (MSR) numbers.
*
@@ -38,13 +40,14 @@
/* Intel MSRs. Some also available on other CPUs */
#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
-#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
-#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
+#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */
+#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
+#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
-#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
+#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
-#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
+#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
@@ -61,24 +64,45 @@
#define MSR_MTRRcap 0x000000fe
#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
-#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
-#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
-#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
-#define ARCH_CAP_SSB_NO (1 << 4) /*
- * Not susceptible to Speculative Store Bypass
- * attack, so no Speculative Store Bypass
- * control required.
- */
+#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */
+#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */
+#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */
+#define ARCH_CAP_SSB_NO BIT(4) /*
+ * Not susceptible to Speculative Store Bypass
+ * attack, so no Speculative Store Bypass
+ * control required.
+ */
+#define ARCH_CAP_MDS_NO BIT(5) /*
+ * Not susceptible to
+ * Microarchitectural Data
+ * Sampling (MDS) vulnerabilities.
+ */
+#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /*
+ * The processor is not susceptible to a
+ * machine check error due to modifying the
+ * code page size along with either the
+ * physical address or cache type
+ * without TLB invalidation.
+ */
+#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */
+#define ARCH_CAP_TAA_NO BIT(8) /*
+ * Not susceptible to
+ * TSX Async Abort (TAA) vulnerabilities.
+ */
#define MSR_IA32_FLUSH_CMD 0x0000010b
-#define L1D_FLUSH (1 << 0) /*
- * Writeback and invalidate the
- * L1 data cache.
- */
+#define L1D_FLUSH BIT(0) /*
+ * Writeback and invalidate the
+ * L1 data cache.
+ */
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
+#define MSR_IA32_TSX_CTRL 0x00000122
+#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
+#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
+
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
#define MSR_IA32_SYSENTER_EIP 0x00000176
@@ -305,6 +329,7 @@
#define MSR_AMD64_PATCH_LEVEL 0x0000008b
#define MSR_AMD64_TSC_RATIO 0xc0000104
#define MSR_AMD64_NB_CFG 0xc001001f
+#define MSR_AMD64_CPUID_FN_1 0xc0011004
#define MSR_AMD64_PATCH_LOADER 0xc0010020
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
#define MSR_AMD64_OSVW_STATUS 0xc0010141
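With the flags expressed as BIT() masks, testing an IA32_ARCH_CAPABILITIES value is a plain bitwise AND per flag. A short sketch decoding a raw value against the masks defined above (the sample value is invented, and BIT() is re-defined locally as a 64-bit shift):

    #include <stdio.h>
    #include <stdint.h>

    #define BIT(n)                (1ULL << (n))
    #define ARCH_CAP_RDCL_NO      BIT(0)
    #define ARCH_CAP_MDS_NO       BIT(5)
    #define ARCH_CAP_TSX_CTRL_MSR BIT(7)
    #define ARCH_CAP_TAA_NO       BIT(8)

    int main(void)
    {
        uint64_t ia32_cap = ARCH_CAP_RDCL_NO | ARCH_CAP_MDS_NO;   /* example value */

        printf("RDCL_NO:      %d\n", !!(ia32_cap & ARCH_CAP_RDCL_NO));
        printf("MDS_NO:       %d\n", !!(ia32_cap & ARCH_CAP_MDS_NO));
        printf("TSX_CTRL MSR: %d\n", !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR));
        printf("TAA_NO:       %d\n", !!(ia32_cap & ARCH_CAP_TAA_NO));
        return 0;
    }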
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index f37f2d8a2989..58b1b766e84e 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <asm/cpufeature.h>
+#include <asm/nospec-branch.h>
#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
@@ -18,7 +19,7 @@
#define MWAIT_ECX_INTERRUPT_BREAK 0x1
#define MWAITX_ECX_TIMER_ENABLE BIT(1)
#define MWAITX_MAX_LOOPS ((u32)-1)
-#define MWAITX_DISABLE_CSTATES 0xf
+#define MWAITX_DISABLE_CSTATES 0xf0
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
@@ -38,6 +39,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
+ mds_idle_clear_cpu_buffers();
+
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
@@ -72,6 +75,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
static inline void __mwaitx(unsigned long eax, unsigned long ebx,
unsigned long ecx)
{
+ /* No MDS buffer clear as this is AMD/HYGON only */
+
/* "mwaitx %eax, %ebx, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xfb;"
:: "a" (eax), "b" (ebx), "c" (ecx));
@@ -79,6 +84,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
+ mds_idle_clear_cpu_buffers();
+
trace_hardirqs_on();
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 1b4132161c1f..8d56d701b5f7 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -3,6 +3,8 @@
#ifndef _ASM_X86_NOSPEC_BRANCH_H_
#define _ASM_X86_NOSPEC_BRANCH_H_
+#include <linux/static_key.h>
+
#include <asm/alternative.h>
#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>
@@ -194,7 +196,7 @@
" lfence;\n" \
" jmp 902b;\n" \
" .align 16\n" \
- "903: addl $4, %%esp;\n" \
+ "903: lea 4(%%esp), %%esp;\n" \
" pushl %[thunk_target];\n" \
" ret;\n" \
" .align 16\n" \
@@ -214,10 +216,17 @@ enum spectre_v2_mitigation {
SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
SPECTRE_V2_RETPOLINE_GENERIC,
SPECTRE_V2_RETPOLINE_AMD,
- SPECTRE_V2_IBRS,
SPECTRE_V2_IBRS_ENHANCED,
};
+/* The indirect branch speculation control variants */
+enum spectre_v2_user_mitigation {
+ SPECTRE_V2_USER_NONE,
+ SPECTRE_V2_USER_STRICT,
+ SPECTRE_V2_USER_PRCTL,
+ SPECTRE_V2_USER_SECCOMP,
+};
+
/* The Speculative Store Bypass disable variants */
enum ssb_mitigation {
SPEC_STORE_BYPASS_NONE,
@@ -295,6 +304,60 @@ do { \
preempt_enable(); \
} while (0)
+DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+DECLARE_STATIC_KEY_FALSE(mds_user_clear);
+DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
+
+#include <asm/segment.h>
+
+/**
+ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+ *
+ * This uses the otherwise unused and obsolete VERW instruction in
+ * combination with microcode which triggers a CPU buffer flush when the
+ * instruction is executed.
+ */
+static inline void mds_clear_cpu_buffers(void)
+{
+ static const u16 ds = __KERNEL_DS;
+
+ /*
+ * Has to be the memory-operand variant because only that
+ * guarantees the CPU buffer flush functionality according to
+ * documentation. The register-operand variant does not.
+ * Works with any segment selector, but a valid writable
+ * data segment is the fastest variant.
+ *
+ * "cc" clobber is required because VERW modifies ZF.
+ */
+ asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
+}
+
+/**
+ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+ *
+ * Clear CPU buffers if the corresponding static key is enabled
+ */
+static inline void mds_user_clear_cpu_buffers(void)
+{
+ if (static_branch_likely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+}
+
+/**
+ * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
+ *
+ * Clear CPU buffers if the corresponding static key is enabled
+ */
+static inline void mds_idle_clear_cpu_buffers(void)
+{
+ if (static_branch_likely(&mds_idle_clear))
+ mds_clear_cpu_buffers();
+}
+
#endif /* __ASSEMBLY__ */
/*
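mds_clear_cpu_buffers() relies on the memory-operand form of VERW plus MD_CLEAR microcode; without that microcode, VERW only performs its legacy segment-verification job. The instruction itself is unprivileged, so the operand form and the ZF ("cc") clobber can be demonstrated from user space. A minimal sketch, assuming GCC/Clang inline asm on x86 (this does not flush anything on its own; the flush semantics exist only with the updated microcode):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t sel;

        /* Grab the current data segment selector; any readable selector works */
        asm volatile("mov %%ds, %0" : "=r" (sel));

        /* Memory-operand VERW, same form as mds_clear_cpu_buffers() uses */
        asm volatile("verw %[sel]" : : [sel] "m" (sel) : "cc");

        printf("executed verw on selector 0x%x\n", sel);
        return 0;
    }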
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f353061bba1d..81d5ea71bbe9 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -200,16 +200,20 @@ struct x86_pmu_capability {
#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
#define IBSCTL_LVT_OFFSET_MASK 0x0F
-/* ibs fetch bits/masks */
+/* IBS fetch bits/masks */
#define IBS_FETCH_RAND_EN (1ULL<<57)
#define IBS_FETCH_VAL (1ULL<<49)
#define IBS_FETCH_ENABLE (1ULL<<48)
#define IBS_FETCH_CNT 0xFFFF0000ULL
#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
-/* ibs op bits/masks */
-/* lower 4 bits of the current count are ignored: */
-#define IBS_OP_CUR_CNT (0xFFFF0ULL<<32)
+/*
+ * IBS op bits/masks
+ * The lower 7 bits of the current count are random bits
+ * preloaded by hardware and ignored in software
+ */
+#define IBS_OP_CUR_CNT (0xFFF80ULL<<32)
+#define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32)
#define IBS_OP_CNT_CTL (1ULL<<19)
#define IBS_OP_VAL (1ULL<<18)
#define IBS_OP_ENABLE (1ULL<<17)
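The widened masks split the 20-bit CurCnt field starting at bit 32 of the IBS op register into a usable 13-bit count and the 7 hardware-randomized low bits that software must ignore. A quick sanity check of that split:

    #include <stdio.h>
    #include <stdint.h>
    #include <assert.h>

    #define IBS_OP_CUR_CNT      (0xFFF80ULL << 32)
    #define IBS_OP_CUR_CNT_RAND (0x0007FULL << 32)

    int main(void)
    {
        /* The two masks are disjoint and together cover the full 20-bit field */
        assert((IBS_OP_CUR_CNT & IBS_OP_CUR_CNT_RAND) == 0);
        assert((IBS_OP_CUR_CNT | IBS_OP_CUR_CNT_RAND) == (0xFFFFFULL << 32));

        uint64_t reg = 0x12345ULL << 32;                  /* made-up register value */
        printf("usable count bits: 0x%llx\n",
               (unsigned long long)((reg & IBS_OP_CUR_CNT) >> 32));
        return 0;
    }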
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 221a32ed1372..f12e61e2a86b 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -44,15 +44,15 @@ struct mm_struct;
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
-static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
- *ptep = native_make_pte(0);
+ WRITE_ONCE(*ptep, pte);
}
-static inline void native_set_pte(pte_t *ptep, pte_t pte)
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
{
- *ptep = pte;
+ native_set_pte(ptep, native_make_pte(0));
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
@@ -62,7 +62,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- *pmdp = pmd;
+ WRITE_ONCE(*pmdp, pmd);
}
static inline void native_pmd_clear(pmd_t *pmd)
@@ -98,7 +98,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
- *pudp = pud;
+ WRITE_ONCE(*pudp, pud);
}
static inline void native_pud_clear(pud_t *pud)
@@ -131,7 +131,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
+ WRITE_ONCE(*pgdp, kaiser_set_shadow_pgd(pgdp, pgd));
}
static inline void native_pgd_clear(pgd_t *pgd)
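Routing the page-table stores through WRITE_ONCE() forces a single, full-width store that the compiler cannot tear or duplicate, which matters because the hardware page walker reads these entries concurrently. A stand-alone illustration of the underlying volatile-cast idiom (simplified; the kernel's WRITE_ONCE() has extra handling for odd sizes):

    #include <stdio.h>
    #include <stdint.h>

    /* Simplified scalar WRITE_ONCE: one untorn store through a volatile lvalue */
    #define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))

    int main(void)
    {
        uint64_t pte = 0;

        WRITE_ONCE(pte, 0x8000000000000063ULL);   /* invented PTE-style bit pattern */
        printf("pte = 0x%llx\n", (unsigned long long)pte);
        return 0;
    }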
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ee8c6290c421..7aa9a9bd9d98 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -213,6 +213,24 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
: "memory");
}
+#define native_cpuid_reg(reg) \
+static inline unsigned int native_cpuid_##reg(unsigned int op) \
+{ \
+ unsigned int eax = op, ebx, ecx = 0, edx; \
+ \
+ native_cpuid(&eax, &ebx, &ecx, &edx); \
+ \
+ return reg; \
+}
+
+/*
+ * Native CPUID functions returning a single datum.
+ */
+native_cpuid_reg(eax)
+native_cpuid_reg(ebx)
+native_cpuid_reg(ecx)
+native_cpuid_reg(edx)
+
static inline void load_cr3(pgd_t *pgdir)
{
write_cr3(__pa(pgdir));
@@ -874,4 +892,17 @@ enum l1tf_mitigations {
extern enum l1tf_mitigations l1tf_mitigation;
+enum mds_mitigations {
+ MDS_MITIGATION_OFF,
+ MDS_MITIGATION_FULL,
+ MDS_MITIGATION_VMWERV,
+};
+
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
#endif /* _ASM_X86_PROCESSOR_H */
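native_cpuid_reg() stamps out one helper per output register so callers such as intel_get_microcode_revision() can request a single datum (native_cpuid_eax(1)). User-space code can get the same effect with the compiler's cpuid intrinsics; a hedged sketch assuming GCC/Clang and <cpuid.h>:

    #include <stdio.h>
    #include <cpuid.h>

    /* Return only EAX of the given leaf, mirroring native_cpuid_eax() */
    static unsigned int cpuid_eax_of(unsigned int leaf)
    {
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

        __get_cpuid(leaf, &eax, &ebx, &ecx, &edx);
        return eax;
    }

    int main(void)
    {
        /* Leaf 1 EAX carries the family/model/stepping signature */
        printf("CPUID(1).EAX = 0x%x\n", cpuid_eax_of(1));
        return 0;
    }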
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2b5d686ea9f3..fb489cd848fa 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -115,9 +115,9 @@ static inline int v8086_mode(struct pt_regs *regs)
#endif
}
-#ifdef CONFIG_X86_64
static inline bool user_64bit_mode(struct pt_regs *regs)
{
+#ifdef CONFIG_X86_64
#ifndef CONFIG_PARAVIRT
/*
* On non-paravirt systems, this is the only long mode CPL 3
@@ -128,8 +128,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
/* Headers are too twisted for this to go in paravirt.h. */
return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
#endif
+#else /* !CONFIG_X86_64 */
+ return false;
+#endif
}
+#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
#endif
@@ -196,23 +200,51 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs,
}
/**
+ * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns the address of the @n th entry of the
+ * kernel stack which is specified by @regs. If the @n th entry is NOT in
+ * the kernel stack, this returns NULL.
+ */
+static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
+{
+ unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return addr;
+ else
+ return NULL;
+}
+
+/* To avoid include hell, we can't include uaccess.h */
+extern long probe_kernel_read(void *dst, const void *src, size_t size);
+
+/**
* regs_get_kernel_stack_nth() - get Nth entry of the stack
* @regs: pt_regs which contains kernel stack pointer.
* @n: stack entry number.
*
* regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
- * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack
* this returns 0.
*/
static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
unsigned int n)
{
- unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
- addr += n;
- if (regs_within_kernel_stack(regs, (unsigned long)addr))
- return *addr;
- else
- return 0;
+ unsigned long *addr;
+ unsigned long val;
+ long ret;
+
+ addr = regs_get_kernel_stack_nth_addr(regs, n);
+ if (addr) {
+ ret = probe_kernel_read(&val, addr, sizeof(val));
+ if (!ret)
+ return val;
+ }
+ return 0;
}
#define arch_has_single_step() (1)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index d25fb6beb2f0..dcaf7100b69c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -177,16 +177,6 @@ extern int safe_smp_processor_id(void);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
-
-#ifndef CONFIG_X86_64
-static inline int logical_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
-}
-
-#endif
-
extern int hard_smp_processor_id(void);
#else /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index ae7c2c5cd7f0..5393babc0598 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}
+static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
+{
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
+ return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
+}
+
static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
{
BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}
+static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
+ return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
+}
+
static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
{
return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
@@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void);
static inline void speculative_store_bypass_ht_init(void) { }
#endif
-extern void speculative_store_bypass_update(unsigned long tif);
-
-static inline void speculative_store_bypass_update_current(void)
-{
- speculative_store_bypass_update(current_thread_info()->flags);
-}
+extern void speculation_ctrl_update(unsigned long tif);
+extern void speculation_ctrl_update_current(void);
#endif
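The new stibp helpers convert between the thread-flag bit (TIF_SPEC_IB, bit 9 in the thread_info.h hunk further down) and the STIBP bit of MSR_IA32_SPEC_CTRL (bit 1) with one shift, mirroring the existing SSBD pair. A small sketch verifying that mapping, with C11 static asserts standing in for BUILD_BUG_ON():

    #include <stdio.h>
    #include <stdint.h>
    #include <assert.h>

    #define SPEC_CTRL_STIBP_SHIFT 1
    #define SPEC_CTRL_STIBP       (1ULL << SPEC_CTRL_STIBP_SHIFT)
    #define TIF_SPEC_IB           9
    #define _TIF_SPEC_IB          (1UL << TIF_SPEC_IB)

    static uint64_t stibp_tif_to_spec_ctrl(uint64_t tifn)
    {
        _Static_assert(TIF_SPEC_IB >= SPEC_CTRL_STIBP_SHIFT, "shift must stay non-negative");
        return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
    }

    int main(void)
    {
        assert(stibp_tif_to_spec_ctrl(_TIF_SPEC_IB) == SPEC_CTRL_STIBP);
        assert(stibp_tif_to_spec_ctrl(0) == 0);
        printf("TIF_SPEC_IB (bit %d) maps to STIBP (bit %d)\n",
               TIF_SPEC_IB, SPEC_CTRL_STIBP_SHIFT);
        return 0;
    }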
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 37f2e0b377ad..4141ead86879 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -55,13 +55,16 @@ extern int kstack_depth_to_print;
static inline unsigned long *
get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
+ struct inactive_task_frame *frame;
+
if (regs)
return (unsigned long *)regs->bp;
if (task == current)
return __builtin_frame_address(0);
- return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp;
+ frame = (struct inactive_task_frame *)task->thread.sp;
+ return (unsigned long *)READ_ONCE_NOCHECK(frame->bp);
}
#else
static inline unsigned long *
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 8e9dbe7b73a1..5cc2ce4ab8a3 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -11,7 +11,13 @@
/* image of the saved processor state */
struct saved_context {
- u16 es, fs, gs, ss;
+ /*
+ * On x86_32, all segment registers, with the possible exception of
+ * gs, are saved at kernel entry in pt_regs.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+ u16 gs;
+#endif
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
bool misc_enable_saved;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 2bd96b4df140..701751918921 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -19,8 +19,20 @@
*/
struct saved_context {
struct pt_regs regs;
- u16 ds, es, fs, gs, ss;
- unsigned long gs_base, gs_kernel_base, fs_base;
+
+ /*
+ * User CS and SS are saved in current_pt_regs(). The rest of the
+ * segment selectors need to be saved and restored here.
+ */
+ u16 ds, es, fs, gs;
+
+ /*
+ * Usermode FSBASE and GSBASE may not match the fs and gs selectors,
+ * so we save them separately. We save the kernelmode GSBASE to
+ * restore percpu access after resume.
+ */
+ unsigned long kernelmode_gs_base, usermode_gs_base, fs_base;
+
unsigned long cr0, cr2, cr3, cr4, cr8;
u64 misc_enable;
bool misc_enable_saved;
@@ -29,8 +41,7 @@ struct saved_context {
u16 gdt_pad; /* Unused */
struct desc_ptr gdt_desc;
u16 idt_pad;
- u16 idt_limit;
- unsigned long idt_base;
+ struct desc_ptr idt;
u16 ldt;
u16 tss;
unsigned long tr;
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5cb436acd463..e959b8d40473 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -8,9 +8,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
-struct tss_struct;
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss);
/* This runs runs on the previous thread's stack. */
static inline void prepare_switch_to(struct task_struct *prev,
@@ -38,6 +35,7 @@ asmlinkage void ret_from_fork(void);
/* data that is pointed to by thread.sp */
struct inactive_task_frame {
+ unsigned long flags;
#ifdef CONFIG_X86_64
unsigned long r15;
unsigned long r14;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 2d8788a59b4d..0438f7fbb383 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,10 +83,12 @@ struct thread_info {
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
-#define TIF_SSBD 5 /* Reduced data speculation */
+#define TIF_SSBD 5 /* Speculative store bypass disable */
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
+#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
@@ -111,6 +113,8 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
+#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
@@ -140,8 +144,18 @@ struct thread_info {
_TIF_NOHZ)
/* flags to check in __switch_to() */
-#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
+#define _TIF_WORK_CTXSW_BASE \
+ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP| \
+ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
+
+/*
+ * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
+ */
+#ifdef CONFIG_SMP
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
+#else
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
+#endif
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 686a58d793e5..f5ca15622dc9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -68,8 +68,12 @@ static inline void invpcid_flush_all_nonglobals(void)
struct tlb_state {
struct mm_struct *active_mm;
int state;
- /* last user mm's ctx id */
- u64 last_ctx_id;
+
+ /* Last user mm for optimizing IBPB */
+ union {
+ struct mm_struct *last_user_mm;
+ unsigned long last_user_mm_ibpb;
+ };
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
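Making the last-user-mm tracking a union of a pointer and an unsigned long lets the same storage be compared as a pointer or manipulated as an integer, so a status flag can live in the pointer's low alignment bits instead of in a separate field. A hedged sketch of that low-bit tagging pattern (the struct and flag name here are illustrative, not the kernel's):

    #include <stdio.h>
    #include <stdint.h>
    #include <assert.h>

    #define FLAG_IBPB 0x1UL   /* illustrative name for a "barrier needed" bit */

    struct fake_mm { int dummy; } __attribute__((aligned(8)));

    int main(void)
    {
        static struct fake_mm mm;
        unsigned long tagged = (unsigned long)&mm | FLAG_IBPB;

        /* Masking the flag back out recovers the original pointer */
        assert((struct fake_mm *)(tagged & ~FLAG_IBPB) == &mm);
        printf("flag bit: %lu\n", tagged & FLAG_IBPB);
        return 0;
    }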
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 2177c7551ff7..9db8d8758ed3 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -438,8 +438,10 @@ do { \
({ \
int __gu_err; \
__inttype(*(ptr)) __gu_val; \
+ __typeof__(ptr) __gu_ptr = (ptr); \
+ __typeof__(size) __gu_size = (size); \
__uaccess_begin_nospec(); \
- __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
+ __get_user_size(__gu_val, __gu_ptr, __gu_size, __gu_err, -EFAULT); \
__uaccess_end(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
__builtin_expect(__gu_err, 0); \
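Capturing ptr and size into __gu_ptr/__gu_size makes the macro evaluate its arguments exactly once, so a call such as get_user(x, p++) cannot advance p twice or re-run a side-effecting size expression. A tiny stand-alone illustration of the single-evaluation idiom (GNU statement expressions, as the kernel uses):

    #include <stdio.h>

    /* Argument evaluated twice: a p++ argument advances by two */
    #define READ_TWICE(dst, p)    ((dst) = *(p), (void)*(p))

    /* Argument captured once, as the patched __get_user_nocheck() now does */
    #define READ_ONCE_ARG(dst, p) ({ __typeof__(p) __p = (p); (dst) = *__p; })

    int main(void)
    {
        int buf[3] = { 10, 20, 30 }, v;
        int *p = buf;

        READ_TWICE(v, p++);            /* p ends up at &buf[2] */
        printf("after READ_TWICE:    v=%d, index=%ld\n", v, (long)(p - buf));

        p = buf;
        READ_ONCE_ARG(v, p++);         /* p ends up at &buf[1] */
        printf("after READ_ONCE_ARG: v=%d, index=%ld\n", v, (long)(p - buf));
        return 0;
    }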
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index e728699db774..59e78c3d3dd8 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -89,8 +89,13 @@ static inline unsigned int __getcpu(void)
* works on all CPUs. This is volatile so that it orders
* correctly wrt barrier() and to keep gcc from cleverly
* hoisting it out of the calling function.
+ *
+ * If RDPID is available, use it.
*/
- asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ alternative_io ("lsl %[seg],%[p]",
+ ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
+ X86_FEATURE_RDPID,
+ [p] "=a" (p), [seg] "r" (__PER_CPU_SEG));
return p;
}
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ccdc23d89b60..9f694537a103 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -216,6 +216,9 @@ privcmd_call(unsigned call,
__HYPERCALL_DECLS;
__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+ if (call >= PAGE_SIZE / sizeof(hypercall_page[0]))
+ return -EINVAL;
+
stac();
asm volatile(CALL_NOSPEC
: __HYPERCALL_5PARAM
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index 3dec769cadf7..1c532b3f18ea 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -27,7 +27,6 @@ header-y += ldt.h
header-y += mce.h
header-y += mman.h
header-y += msgbuf.h
-header-y += msr-index.h
header-y += msr.h
header-y += mtrr.h
header-y += param.h
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 69a6e07e3149..db7dae58745f 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -28,6 +28,8 @@ struct mce {
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
+ __u64 ppin; /* Protected Processor Inventory Number */
+ __u32 microcode;/* Microcode revision */
};
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0a1e8a67cc99..c3fba8b52753 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1717,7 +1717,7 @@ int __acpi_acquire_global_lock(unsigned int *lock)
new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
val = cmpxchg(lock, old, new);
} while (unlikely (val != old));
- return (new < 3) ? -1 : 0;
+ return ((new & 0x3) < 3) ? -1 : 0;
}
int __acpi_release_global_lock(unsigned int *lock)
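The masking fix matters because only the low two bits of the FACS global-lock dword carry the handshake state: bit 0 is "pending" and bit 1 is "owned", while higher bits are unrelated. A single-threaded sketch of the acquire arithmetic on a plain variable (the cmpxchg() loop is replaced by a direct store, just to show the bit manipulation and the fixed comparison):

    #include <stdio.h>

    /* One acquire attempt on the ACPI global lock: bit0 = pending, bit1 = owned */
    static int try_acquire(unsigned int *lock)
    {
        unsigned int old = *lock;
        unsigned int new = ((old & ~0x3u) + 2) + ((old >> 1) & 0x1);

        *lock = new;                          /* stands in for cmpxchg(lock, old, new) */
        return ((new & 0x3) < 3) ? -1 : 0;    /* -1: acquired, 0: owner busy, pending set */
    }

    int main(void)
    {
        unsigned int lock = 0;

        printf("first attempt:  %s (lock=0x%x)\n",
               try_acquire(&lock) ? "acquired" : "pending", lock);
        printf("second attempt: %s (lock=0x%x)\n",
               try_acquire(&lock) ? "acquired" : "pending", lock);
        return 0;
    }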
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 169963f471bb..50b8ed0317a3 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
+#ifdef CONFIG_KASAN
+ /*
+ * The suspend path may have poisoned some areas deeper in the stack,
+ * which we now need to unpoison.
+ */
+ movq %rsp, %rdi
+ call kasan_unpoison_task_stack_below
+#endif
+
xorl %eax, %eax
addq $8, %rsp
FRAME_END
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4f2af1ee09cb..722a76b88bcc 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -183,7 +183,7 @@ int first_system_vector = FIRST_SYSTEM_VECTOR;
/*
* Debug level, exported for io_apic.c
*/
-unsigned int apic_verbosity;
+int apic_verbosity;
int pic_mode;
@@ -629,7 +629,7 @@ static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
- * Temporary interrupt handler.
+ * Temporary interrupt handler and polled calibration function.
*/
static void __init lapic_cal_handler(struct clock_event_device *dev)
{
@@ -713,7 +713,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
- void (*real_handler)(struct clock_event_device *dev);
+ u64 tsc_perj = 0, tsc_start = 0;
+ unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
int pm_referenced = 0;
@@ -742,28 +743,64 @@ static int __init calibrate_APIC_clock(void)
apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
"calibrating APIC timer ...\n");
+ /*
+ * There are platforms w/o global clockevent devices. Instead of
+ * making the calibration conditional on that, use a polling based
+ * approach everywhere.
+ */
local_irq_disable();
- /* Replace the global interrupt handler */
- real_handler = global_clock_event->event_handler;
- global_clock_event->event_handler = lapic_cal_handler;
-
/*
* Setup the APIC counter to maximum. There is no way the lapic
* can underflow in the 100ms detection time frame
*/
__setup_APIC_LVTT(0xffffffff, 0, 0);
- /* Let the interrupts run */
+ /*
+ * Methods to terminate the calibration loop:
+ * 1) Global clockevent if available (jiffies)
+ * 2) TSC if available and frequency is known
+ */
+ jif_start = READ_ONCE(jiffies);
+
+ if (tsc_khz) {
+ tsc_start = rdtsc();
+ tsc_perj = div_u64((u64)tsc_khz * 1000, HZ);
+ }
+
+ /*
+ * Enable interrupts so the tick can fire, if a global
+ * clockevent device is available
+ */
local_irq_enable();
- while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
- cpu_relax();
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) {
+ /* Wait for a tick to elapse */
+ while (1) {
+ if (tsc_khz) {
+ u64 tsc_now = rdtsc();
+ if ((tsc_now - tsc_start) >= tsc_perj) {
+ tsc_start += tsc_perj;
+ break;
+ }
+ } else {
+ unsigned long jif_now = READ_ONCE(jiffies);
- local_irq_disable();
+ if (time_after(jif_now, jif_start)) {
+ jif_start = jif_now;
+ break;
+ }
+ }
+ cpu_relax();
+ }
- /* Restore the real event handler */
- global_clock_event->event_handler = real_handler;
+ /* Invoke the calibration routine */
+ local_irq_disable();
+ lapic_cal_handler(NULL);
+ local_irq_enable();
+ }
+
+ local_irq_disable();
/* Build delta t1-t2 as apic timer counts down */
delta = lapic_cal_t1 - lapic_cal_t2;
@@ -814,10 +851,11 @@ static int __init calibrate_APIC_clock(void)
levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
/*
- * PM timer calibration failed or not turned on
- * so lets try APIC timer based calibration
+ * PM timer calibration failed or not turned on so let's try APIC
+ * timer based calibration, if a global clockevent device is
+ * available.
*/
- if (!pm_referenced) {
+ if (!pm_referenced && global_clock_event) {
apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
/*
@@ -1243,6 +1281,56 @@ static void lapic_setup_esr(void)
oldvalue, value);
}
+static void apic_pending_intr_clear(void)
+{
+ long long max_loops = cpu_khz ? cpu_khz : 1000000;
+ unsigned long long tsc = 0, ntsc;
+ unsigned int value, queued;
+ int i, j, acked = 0;
+
+ if (boot_cpu_has(X86_FEATURE_TSC))
+ tsc = rdtsc();
+ /*
+ * After a crash, we no longer service the interrupts and a pending
+ * interrupt from previous kernel might still have ISR bit set.
+ *
+ * Most probably by now CPU has serviced that pending interrupt and
+ * it might not have done the ack_APIC_irq() because it thought,
+ * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
+ * does not clear the ISR bit and cpu thinks it has already serivced
+ * the interrupt. Hence a vector might get locked. It was noticed
+ * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
+ */
+ do {
+ queued = 0;
+ for (i = APIC_ISR_NR - 1; i >= 0; i--)
+ queued |= apic_read(APIC_IRR + i*0x10);
+
+ for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+ value = apic_read(APIC_ISR + i*0x10);
+ for (j = 31; j >= 0; j--) {
+ if (value & (1<<j)) {
+ ack_APIC_irq();
+ acked++;
+ }
+ }
+ }
+ if (acked > 256) {
+ printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+ acked);
+ break;
+ }
+ if (queued) {
+ if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
+ ntsc = rdtsc();
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ }
+ } while (queued && max_loops > 0);
+ WARN_ON(max_loops <= 0);
+}
+
/**
* setup_local_APIC - setup the local APIC
*
@@ -1252,19 +1340,22 @@ static void lapic_setup_esr(void)
void setup_local_APIC(void)
{
int cpu = smp_processor_id();
- unsigned int value, queued;
- int i, j, acked = 0;
- unsigned long long tsc = 0, ntsc;
- long long max_loops = cpu_khz ? cpu_khz : 1000000;
+ unsigned int value;
- if (boot_cpu_has(X86_FEATURE_TSC))
- tsc = rdtsc();
if (disable_apic) {
disable_ioapic_support();
return;
}
+ /*
+ * If this comes from kexec/kcrash the APIC might be enabled in
+ * SPIV. Soft disable it before doing further initialization.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
if (lapic_is_integrated() && apic->disable_esr) {
@@ -1290,16 +1381,21 @@ void setup_local_APIC(void)
apic->init_apic_ldr();
#ifdef CONFIG_X86_32
- /*
- * APIC LDR is initialized. If logical_apicid mapping was
- * initialized during get_smp_config(), make sure it matches the
- * actual value.
- */
- i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
- WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
- /* always use the value from LDR */
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
- logical_smp_processor_id();
+ if (apic->dest_logical) {
+ int logical_apicid, ldr_apicid;
+
+ /*
+ * APIC LDR is initialized. If logical_apicid mapping was
+ * initialized during get_smp_config(), make sure it matches
+ * the actual value.
+ */
+ logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+ ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+ if (logical_apicid != BAD_APICID)
+ WARN_ON(logical_apicid != ldr_apicid);
+ /* Always use the value from LDR. */
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+ }
#endif
/*
@@ -1310,45 +1406,7 @@ void setup_local_APIC(void)
value &= ~APIC_TPRI_MASK;
apic_write(APIC_TASKPRI, value);
- /*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now CPU has serviced that pending interrupt and
- * it might not have done the ack_APIC_irq() because it thought,
- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
- * does not clear the ISR bit and cpu thinks it has already serivced
- * the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
- */
- do {
- queued = 0;
- for (i = APIC_ISR_NR - 1; i >= 0; i--)
- queued |= apic_read(APIC_IRR + i*0x10);
-
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j)) {
- ack_APIC_irq();
- acked++;
- }
- }
- }
- if (acked > 256) {
- printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
- acked);
- break;
- }
- if (queued) {
- if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
- ntsc = rdtsc();
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
- } else
- max_loops--;
- }
- } while (queued && max_loops > 0);
- WARN_ON(max_loops <= 0);
+ apic_pending_intr_clear();
/*
* Now that we are all set up, enable the APIC
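The rewritten calibration loop no longer needs to hijack the global clockevent handler: each LAPIC_CAL_LOOPS iteration busy-waits for one tick, detected either by a jiffies increment or, when tsc_khz is already known, by tsc_perj = tsc_khz * 1000 / HZ elapsed TSC cycles, and then calls lapic_cal_handler() directly. A trivial check of that per-jiffy arithmetic (the tsc_khz and HZ values are invented):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t tsc_khz = 2400000;                 /* 2.4 GHz, example value */
        unsigned int hz = 250;                      /* example CONFIG_HZ */
        uint64_t tsc_perj = tsc_khz * 1000 / hz;    /* cycles per jiffy, as div_u64() computes */

        printf("TSC cycles per jiffy: %llu\n", (unsigned long long)tsc_perj);
        return 0;
    }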
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 56012010332c..76fe153ccc6d 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -37,32 +37,12 @@ static int bigsmp_early_logical_apicid(int cpu)
return early_per_cpu(x86_cpu_to_apicid, cpu);
}
-static inline unsigned long calculate_ldr(int cpu)
-{
- unsigned long val, id;
-
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- id = per_cpu(x86_bios_cpu_apicid, cpu);
- val |= SET_APIC_LOGICAL_ID(id);
-
- return val;
-}
-
/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
+ * bigsmp enables physical destination mode
+ * and doesn't use LDR and DFR
*/
static void bigsmp_init_apic_ldr(void)
{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
}
static void bigsmp_setup_apic_routing(void)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d34629d70421..3401b28f1312 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1712,9 +1712,10 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
static inline bool ioapic_irqd_mask(struct irq_data *data)
{
- /* If we are moving the irq we need to mask it */
+ /* If we are moving the IRQ we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- mask_ioapic_irq(data);
+ if (!irqd_irq_masked(data))
+ mask_ioapic_irq(data);
return true;
}
return false;
@@ -1751,7 +1752,9 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
*/
if (!io_apic_level_ack_pending(data->chip_data))
irq_move_masked_irq(data);
- unmask_ioapic_irq(data);
+ /* If the IRQ is masked in the core, leave it: */
+ if (!irqd_irq_masked(data))
+ unmask_ioapic_irq(data);
}
}
#else
@@ -2346,7 +2349,13 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
* dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
* gsi_top if ioapic_dynirq_base hasn't been initialized yet.
*/
- return ioapic_initialized ? ioapic_dynirq_base : gsi_top;
+ if (!ioapic_initialized)
+ return gsi_top;
+ /*
+ * For DT enabled machines ioapic_dynirq_base is irrelevant and not
+ * updated. So simply return @from if ioapic_dynirq_base == 0.
+ */
+ return ioapic_dynirq_base ? : from;
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 33b63670bf09..f6e386fe510c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -25,7 +25,7 @@ obj-y += bugs.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel.o tsx.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index be6d0543e626..9428b54fff66 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -746,6 +746,64 @@ static void init_amd_ln(struct cpuinfo_x86 *c)
msr_set_bit(MSR_AMD64_DE_CFG, 31);
}
+static bool rdrand_force;
+
+static int __init rdrand_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "force"))
+ rdrand_force = true;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("rdrand", rdrand_cmdline);
+
+static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
+{
+ /*
+ * Saving of the MSR used to hide the RDRAND support during
+ * suspend/resume is done by arch/x86/power/cpu.c, which is
+ * dependent on CONFIG_PM_SLEEP.
+ */
+ if (!IS_ENABLED(CONFIG_PM_SLEEP))
+ return;
+
+ /*
+ * The nordrand option can clear X86_FEATURE_RDRAND, so check for
+ * RDRAND support using the CPUID function directly.
+ */
+ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
+ return;
+
+ msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
+
+ /*
+ * Verify that the CPUID change has occurred in case the kernel is
+ * running virtualized and the hypervisor doesn't support the MSR.
+ */
+ if (cpuid_ecx(1) & BIT(30)) {
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
+ return;
+ }
+
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
+}
+
+static void init_amd_jg(struct cpuinfo_x86 *c)
+{
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
+
static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
@@ -760,14 +818,24 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
wrmsrl_safe(MSR_F15H_IC_CFG, value);
}
}
+
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
}
static void init_amd_zn(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_ZEN);
- /* Fix erratum 1076: CPB feature bit not being set in CPUID. */
- if (!cpu_has(c, X86_FEATURE_CPB))
+ /*
+ * Fix erratum 1076: CPB feature bit not being set in CPUID.
+ * Always set it, except when running under a hypervisor.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB))
set_cpu_cap(c, X86_FEATURE_CPB);
}
@@ -801,6 +869,7 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x10: init_amd_gh(c); break;
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
+ case 0x16: init_amd_jg(c); break;
case 0x17: init_amd_zn(c); break;
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 6221166e3fca..24307d5bb4b8 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
+#include <linux/sched/smt.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -24,21 +25,26 @@
#include <asm/vmx.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
+#include <asm/hypervisor.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/intel-family.h>
#include <asm/e820.h>
+#include "cpu.h"
+
+static void __init spectre_v1_select_mitigation(void);
static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
+static void __init mds_select_mitigation(void);
+static void __init mds_print_mitigation(void);
+static void __init taa_select_mitigation(void);
-/*
- * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
- * writes to SPEC_CTRL contain whatever reserved bits have been set.
- */
-u64 __ro_after_init x86_spec_ctrl_base;
+/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+u64 x86_spec_ctrl_base;
EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+static DEFINE_MUTEX(spec_ctrl_mutex);
/*
* The vendor and possibly platform specific bits which can be modified in
@@ -53,6 +59,20 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
u64 __ro_after_init x86_amd_ls_cfg_base;
u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
+/* Control conditional STIPB in switch_to() */
+DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+/* Control MDS CPU buffer clear before returning to user space */
+DEFINE_STATIC_KEY_FALSE(mds_user_clear);
+EXPORT_SYMBOL_GPL(mds_user_clear);
+/* Control MDS CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
+EXPORT_SYMBOL_GPL(mds_idle_clear);
+
void __init check_bugs(void)
{
identify_boot_cpu();
@@ -80,16 +100,21 @@ void __init check_bugs(void)
if (boot_cpu_has(X86_FEATURE_STIBP))
x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
- /* Select the proper spectre mitigation before patching alternatives */
+ /* Select the proper CPU mitigations before patching alternatives: */
+ spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
+ ssb_select_mitigation();
+ l1tf_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
/*
- * Select proper mitigation for any exposure to the Speculative Store
- * Bypass vulnerability.
+ * As MDS and TAA mitigations are inter-related, delay printing the
+ * MDS mitigation until after TAA mitigation selection is done.
*/
- ssb_select_mitigation();
+ mds_print_mitigation();
- l1tf_select_mitigation();
+ arch_smt_update();
#ifdef CONFIG_X86_32
/*
@@ -123,31 +148,6 @@ void __init check_bugs(void)
#endif
}
-/* The kernel command line selection */
-enum spectre_v2_mitigation_cmd {
- SPECTRE_V2_CMD_NONE,
- SPECTRE_V2_CMD_AUTO,
- SPECTRE_V2_CMD_FORCE,
- SPECTRE_V2_CMD_RETPOLINE,
- SPECTRE_V2_CMD_RETPOLINE_GENERIC,
- SPECTRE_V2_CMD_RETPOLINE_AMD,
-};
-
-static const char *spectre_v2_strings[] = {
- [SPECTRE_V2_NONE] = "Vulnerable",
- [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
- [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
- [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
- [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
- [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
-};
-
-#undef pr_fmt
-#define pr_fmt(fmt) "Spectre V2 : " fmt
-
-static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
- SPECTRE_V2_NONE;
-
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
@@ -165,9 +165,14 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
/* SSBD controlled in MSR_SPEC_CTRL */
- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD))
hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
+ /* Conditional STIBP enabled? */
+ if (static_branch_unlikely(&switch_to_cond_stibp))
+ hostval |= stibp_tif_to_spec_ctrl(ti->flags);
+
if (hostval != guestval) {
msrval = setguest ? guestval : hostval;
wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
@@ -201,7 +206,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
ssbd_spec_ctrl_to_tif(hostval);
- speculative_store_bypass_update(tif);
+ speculation_ctrl_update(tif);
}
}
EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
@@ -216,6 +221,275 @@ static void x86_amd_ssb_disable(void)
wrmsrl(MSR_AMD64_LS_CFG, msrval);
}
+#undef pr_fmt
+#define pr_fmt(fmt) "MDS: " fmt
+
+/* Default mitigation for MDS-affected CPUs */
+static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+static bool mds_nosmt __ro_after_init = false;
+
+static const char * const mds_strings[] = {
+ [MDS_MITIGATION_OFF] = "Vulnerable",
+ [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+ [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+};
+
+static void __init mds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
+ mds_mitigation = MDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (mds_mitigation == MDS_MITIGATION_FULL) {
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ mds_mitigation = MDS_MITIGATION_VMWERV;
+
+ static_branch_enable(&mds_user_clear);
+
+ if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
+ (mds_nosmt || cpu_mitigations_auto_nosmt()))
+ cpu_smt_disable(false);
+ }
+}
+
+static void __init mds_print_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ return;
+
+ pr_info("%s\n", mds_strings[mds_mitigation]);
+}
+
+static int __init mds_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ mds_mitigation = MDS_MITIGATION_OFF;
+ else if (!strcmp(str, "full"))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else if (!strcmp(str, "full,nosmt")) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mds", mds_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static void __init taa_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ goto out;
+ }
+
+ if (cpu_mitigations_off()) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /*
+ * TAA mitigation via VERW is turned off if both
+ * tsx_async_abort=off and mds=off are specified.
+ */
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ mds_mitigation == MDS_MITIGATION_OFF)
+ goto out;
+
+ if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+	 * On MDS_NO=1 CPUs, if ARCH_CAP_TSX_CTRL_MSR is not set, a microcode
+	 * update is required.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+	if ((ia32_cap & ARCH_CAP_MDS_NO) &&
+ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+	 * TSX is enabled, so select the alternate mitigation for TAA, which
+	 * is the same as MDS. Enable the MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ static_branch_enable(&mds_user_clear);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+
+ /*
+	 * Update MDS mitigation, if necessary, as mds_user_clear is
+ * now enabled for TAA mitigation.
+ */
+ if (mds_mitigation == MDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_select_mitigation();
+ }
+out:
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V1 : " fmt
+
+enum spectre_v1_mitigation {
+ SPECTRE_V1_MITIGATION_NONE,
+ SPECTRE_V1_MITIGATION_AUTO,
+};
+
+static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
+ SPECTRE_V1_MITIGATION_AUTO;
+
+static const char * const spectre_v1_strings[] = {
+ [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
+ [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization",
+};
+
+/*
+ * Does SMAP provide full mitigation against speculative kernel access to
+ * userspace?
+ */
+static bool smap_works_speculatively(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ /*
+ * On CPUs which are vulnerable to Meltdown, SMAP does not
+ * prevent speculative access to user data in the L1 cache.
+ * Consider SMAP to be non-functional as a mitigation on these
+ * CPUs.
+ */
+ if (boot_cpu_has(X86_BUG_CPU_MELTDOWN))
+ return false;
+
+ return true;
+}
+
+static void __init spectre_v1_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) {
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+ return;
+ }
+
+ if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
+ /*
+ * With Spectre v1, a user can speculatively control either
+ * path of a conditional swapgs with a user-controlled GS
+ * value. The mitigation is to add lfences to both code paths.
+ *
+ * If FSGSBASE is enabled, the user can put a kernel address in
+ * GS, in which case SMAP provides no protection.
+ *
+ * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the
+ * FSGSBASE enablement patches have been merged. ]
+ *
+ * If FSGSBASE is disabled, the user can only put a user space
+ * address in GS. That makes an attack harder, but still
+ * possible if there's no SMAP protection.
+ */
+ if (!smap_works_speculatively()) {
+ /*
+ * Mitigation can be provided from SWAPGS itself or
+ * PTI as the CR3 write in the Meltdown mitigation
+ * is serializing.
+ *
+ * If neither is there, mitigate with an LFENCE to
+ * stop speculation through swapgs.
+ */
+ if (boot_cpu_has_bug(X86_BUG_SWAPGS) &&
+ !boot_cpu_has(X86_FEATURE_KAISER))
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER);
+
+ /*
+ * Enable lfences in the kernel entry (non-swapgs)
+ * paths, to prevent user entry from speculatively
+ * skipping swapgs.
+ */
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL);
+ }
+ }
+
+ pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+}
+
+static int __init nospectre_v1_cmdline(char *str)
+{
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+ return 0;
+}
+early_param("nospectre_v1", nospectre_v1_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt
+
+static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+ SPECTRE_V2_NONE;
+
+static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+
#ifdef RETPOLINE
static bool spectre_v2_bad_module;
@@ -237,67 +511,225 @@ static inline const char *spectre_v2_module_string(void)
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
-static void __init spec2_print_if_insecure(const char *reason)
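+/* Compare a length-delimited cmdline argument against a known option string. */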
+static inline bool match_option(const char *arg, int arglen, const char *opt)
{
- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
- pr_info("%s selected on command line.\n", reason);
+ int len = strlen(opt);
+
+ return len == arglen && !strncmp(arg, opt, len);
}
-static void __init spec2_print_if_secure(const char *reason)
+/* The kernel command line selection for spectre v2 */
+enum spectre_v2_mitigation_cmd {
+ SPECTRE_V2_CMD_NONE,
+ SPECTRE_V2_CMD_AUTO,
+ SPECTRE_V2_CMD_FORCE,
+ SPECTRE_V2_CMD_RETPOLINE,
+ SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+ SPECTRE_V2_CMD_RETPOLINE_AMD,
+};
+
+enum spectre_v2_user_cmd {
+ SPECTRE_V2_USER_CMD_NONE,
+ SPECTRE_V2_USER_CMD_AUTO,
+ SPECTRE_V2_USER_CMD_FORCE,
+ SPECTRE_V2_USER_CMD_PRCTL,
+ SPECTRE_V2_USER_CMD_PRCTL_IBPB,
+ SPECTRE_V2_USER_CMD_SECCOMP,
+ SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
+};
+
+static const char * const spectre_v2_user_strings[] = {
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
+};
+
+static const struct {
+ const char *option;
+ enum spectre_v2_user_cmd cmd;
+ bool secure;
+} v2_user_options[] __initconst = {
+ { "auto", SPECTRE_V2_USER_CMD_AUTO, false },
+ { "off", SPECTRE_V2_USER_CMD_NONE, false },
+ { "on", SPECTRE_V2_USER_CMD_FORCE, true },
+ { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false },
+ { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false },
+ { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false },
+ { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false },
+};
+
+static void __init spec_v2_user_print_cond(const char *reason, bool secure)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
- pr_info("%s selected on command line.\n", reason);
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
+ pr_info("spectre_v2_user=%s forced on command line.\n", reason);
}
-static inline bool retp_compiler(void)
+static enum spectre_v2_user_cmd __init
+spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
{
- return __is_defined(RETPOLINE);
+ char arg[20];
+ int ret, i;
+
+ switch (v2_cmd) {
+ case SPECTRE_V2_CMD_NONE:
+ return SPECTRE_V2_USER_CMD_NONE;
+ case SPECTRE_V2_CMD_FORCE:
+ return SPECTRE_V2_USER_CMD_FORCE;
+ default:
+ break;
+ }
+
+ ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
+ arg, sizeof(arg));
+ if (ret < 0)
+ return SPECTRE_V2_USER_CMD_AUTO;
+
+ for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
+ if (match_option(arg, ret, v2_user_options[i].option)) {
+ spec_v2_user_print_cond(v2_user_options[i].option,
+ v2_user_options[i].secure);
+ return v2_user_options[i].cmd;
+ }
+ }
+
+ pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
+ return SPECTRE_V2_USER_CMD_AUTO;
}
-static inline bool match_option(const char *arg, int arglen, const char *opt)
+static void __init
+spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
{
- int len = strlen(opt);
+ enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
+ bool smt_possible = IS_ENABLED(CONFIG_SMP);
+ enum spectre_v2_user_cmd cmd;
- return len == arglen && !strncmp(arg, opt, len);
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
+ cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ smt_possible = false;
+
+ cmd = spectre_v2_parse_user_cmdline(v2_cmd);
+ switch (cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ goto set_mode;
+ case SPECTRE_V2_USER_CMD_FORCE:
+ mode = SPECTRE_V2_USER_STRICT;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ mode = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_AUTO:
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ mode = SPECTRE_V2_USER_SECCOMP;
+ else
+ mode = SPECTRE_V2_USER_PRCTL;
+ break;
+ }
+
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+
+ switch (cmd) {
+ case SPECTRE_V2_USER_CMD_FORCE:
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ static_branch_enable(&switch_mm_always_ibpb);
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ case SPECTRE_V2_USER_CMD_AUTO:
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
+
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
+
+	/* If enhanced IBRS is enabled, no STIBP is required */
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ return;
+
+ /*
+	 * If SMT is not possible or STIBP is not available, clear the STIBP
+	 * mode.
+ */
+ if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
+ mode = SPECTRE_V2_USER_NONE;
+set_mode:
+ spectre_v2_user = mode;
+	/* Only print the STIBP mode when SMT is possible */
+ if (smt_possible)
+ pr_info("%s\n", spectre_v2_user_strings[mode]);
}
+static const char * const spectre_v2_strings[] = {
+ [SPECTRE_V2_NONE] = "Vulnerable",
+ [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
+ [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
+ [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
+ [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
+ [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
+};
+
static const struct {
const char *option;
enum spectre_v2_mitigation_cmd cmd;
bool secure;
-} mitigation_options[] = {
- { "off", SPECTRE_V2_CMD_NONE, false },
- { "on", SPECTRE_V2_CMD_FORCE, true },
- { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
- { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
- { "auto", SPECTRE_V2_CMD_AUTO, false },
+} mitigation_options[] __initconst = {
+ { "off", SPECTRE_V2_CMD_NONE, false },
+ { "on", SPECTRE_V2_CMD_FORCE, true },
+ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "auto", SPECTRE_V2_CMD_AUTO, false },
};
+static void __init spec_v2_print_cond(const char *reason, bool secure)
+{
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
+ pr_info("%s selected on command line.\n", reason);
+}
+
+static inline bool retp_compiler(void)
+{
+ return __is_defined(RETPOLINE);
+}
+
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
+ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
char arg[20];
int ret, i;
- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
+ cpu_mitigations_off())
return SPECTRE_V2_CMD_NONE;
- else {
- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
- if (ret < 0)
- return SPECTRE_V2_CMD_AUTO;
- for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
- if (!match_option(arg, ret, mitigation_options[i].option))
- continue;
- cmd = mitigation_options[i].cmd;
- break;
- }
+ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
+ if (ret < 0)
+ return SPECTRE_V2_CMD_AUTO;
- if (i >= ARRAY_SIZE(mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_CMD_AUTO;
- }
+ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
+ if (!match_option(arg, ret, mitigation_options[i].option))
+ continue;
+ cmd = mitigation_options[i].cmd;
+ break;
+ }
+
+ if (i >= ARRAY_SIZE(mitigation_options)) {
+ pr_err("unknown option (%s). Switching to AUTO select\n", arg);
+ return SPECTRE_V2_CMD_AUTO;
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
@@ -314,11 +746,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (mitigation_options[i].secure)
- spec2_print_if_secure(mitigation_options[i].option);
- else
- spec2_print_if_insecure(mitigation_options[i].option);
-
+ spec_v2_print_cond(mitigation_options[i].option,
+ mitigation_options[i].secure);
return cmd;
}
@@ -400,12 +829,6 @@ specv2_set_mode:
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
- /* Initialize Indirect Branch Prediction Barrier if supported */
- if (boot_cpu_has(X86_FEATURE_IBPB)) {
- setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
- pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
- }
-
/*
* Retpoline means the kernel is safe because it has no indirect
* branches. Enhanced IBRS protects firmware too, so, enable restricted
@@ -421,6 +844,107 @@ specv2_set_mode:
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
+
+ /* Set up IBPB and STIBP depending on the general spectre V2 command */
+ spectre_v2_user_select_mitigation(cmd);
+}
+
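+/* Write the updated x86_spec_ctrl_base value to this CPU's SPEC_CTRL MSR. */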
+static void update_stibp_msr(void * __unused)
+{
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+}
+
+/* Update x86_spec_ctrl_base in case SMT state changed. */
+static void update_stibp_strict(void)
+{
+ u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
+
+ if (sched_smt_active())
+ mask |= SPEC_CTRL_STIBP;
+
+ if (mask == x86_spec_ctrl_base)
+ return;
+
+ pr_info("Update user space SMT mitigation: STIBP %s\n",
+ mask & SPEC_CTRL_STIBP ? "always-on" : "off");
+ x86_spec_ctrl_base = mask;
+ on_each_cpu(update_stibp_msr, NULL, 1);
+}
+
+/* Update the static key controlling the evaluation of TIF_SPEC_IB */
+static void update_indir_branch_cond(void)
+{
+ if (sched_smt_active())
+ static_branch_enable(&switch_to_cond_stibp);
+ else
+ static_branch_disable(&switch_to_cond_stibp);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+/* Update the static key controlling the MDS CPU buffer clear in idle */
+static void update_mds_branch_idle(void)
+{
+ /*
+ * Enable the idle clearing if SMT is active on CPUs which are
+ * affected only by MSBDS and not any other MDS variant.
+ *
+ * The other variants cannot be mitigated when SMT is enabled, so
+ * clearing the buffers on idle just to prevent the Store Buffer
+ * repartitioning leak would be a window dressing exercise.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
+ return;
+
+ if (sched_smt_active())
+ static_branch_enable(&mds_idle_clear);
+ else
+ static_branch_disable(&mds_idle_clear);
+}
+
+#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+
+void arch_smt_update(void)
+{
+ mutex_lock(&spec_ctrl_mutex);
+
+ switch (spectre_v2_user) {
+ case SPECTRE_V2_USER_NONE:
+ break;
+ case SPECTRE_V2_USER_STRICT:
+ update_stibp_strict();
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ update_indir_branch_cond();
+ break;
+ }
+
+ switch (mds_mitigation) {
+ case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_VMWERV:
+ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+ pr_warn_once(MDS_MSG_SMT);
+ update_mds_branch_idle();
+ break;
+ case MDS_MITIGATION_OFF:
+ break;
+ }
+
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
+ mutex_unlock(&spec_ctrl_mutex);
}
#undef pr_fmt
@@ -437,7 +961,7 @@ enum ssb_mitigation_cmd {
SPEC_STORE_BYPASS_CMD_SECCOMP,
};
-static const char *ssb_strings[] = {
+static const char * const ssb_strings[] = {
[SPEC_STORE_BYPASS_NONE] = "Vulnerable",
[SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
[SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
@@ -447,7 +971,7 @@ static const char *ssb_strings[] = {
static const struct {
const char *option;
enum ssb_mitigation_cmd cmd;
-} ssb_mitigation_options[] = {
+} ssb_mitigation_options[] __initconst = {
{ "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
{ "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
{ "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
@@ -461,7 +985,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
char arg[20];
int ret, i;
- if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
+ if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
+ cpu_mitigations_off()) {
return SPEC_STORE_BYPASS_CMD_NONE;
} else {
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
@@ -523,6 +1048,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
}
/*
+ * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper
+ * bit in the mask to allow guests to use the mitigation even in the
+ * case where the host does not enable it.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
+ }
+
+ /*
* We have three CPU feature flags that are in play here:
* - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
* - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
@@ -531,18 +1066,15 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
if (mode == SPEC_STORE_BYPASS_DISABLE) {
setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
/*
- * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
- * a completely different MSR and bit dependent on family.
+ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
+ * use a completely different MSR and bit dependent on family.
*/
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
+ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ x86_amd_ssb_disable();
+ } else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
- x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
- break;
- case X86_VENDOR_AMD:
- x86_amd_ssb_disable();
- break;
}
}
@@ -560,10 +1092,25 @@ static void ssb_select_mitigation(void)
#undef pr_fmt
#define pr_fmt(fmt) "Speculation prctl: " fmt
-static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+static void task_update_spec_tif(struct task_struct *tsk)
{
- bool update;
+ /* Force the update of the real TIF bits */
+ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
+ /*
+ * Immediately update the speculation control MSRs for the current
+ * task, but for a non-current task delay setting the CPU
+ * mitigation until it is scheduled next.
+ *
+ * This can only happen for SECCOMP mitigation. For PRCTL it's
+ * always the current task.
+ */
+ if (tsk == current)
+ speculation_ctrl_update_current();
+}
+
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
return -ENXIO;
@@ -574,28 +1121,56 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
if (task_spec_ssb_force_disable(task))
return -EPERM;
task_clear_spec_ssb_disable(task);
- update = test_and_clear_tsk_thread_flag(task, TIF_SSBD);
+ task_update_spec_tif(task);
break;
case PR_SPEC_DISABLE:
task_set_spec_ssb_disable(task);
- update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
+ task_update_spec_tif(task);
break;
case PR_SPEC_FORCE_DISABLE:
task_set_spec_ssb_disable(task);
task_set_spec_ssb_force_disable(task);
- update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
+ task_update_spec_tif(task);
break;
default:
return -ERANGE;
}
+ return 0;
+}
- /*
- * If being set on non-current task, delay setting the CPU
- * mitigation until it is next scheduled.
- */
- if (task == current && update)
- speculative_store_bypass_update_current();
-
+static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ return 0;
+ /*
+ * Indirect branch speculation is always disabled in strict
+ * mode.
+ */
+ if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+ return -EPERM;
+ task_clear_spec_ib_disable(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ case PR_SPEC_FORCE_DISABLE:
+ /*
+ * Indirect branch speculation is always allowed when
+ * mitigation is force disabled.
+ */
+ if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ return -EPERM;
+ if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+ return 0;
+ task_set_spec_ib_disable(task);
+ if (ctrl == PR_SPEC_FORCE_DISABLE)
+ task_set_spec_ib_force_disable(task);
+ task_update_spec_tif(task);
+ break;
+ default:
+ return -ERANGE;
+ }
return 0;
}
@@ -605,6 +1180,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssb_prctl_set(task, ctrl);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_set(task, ctrl);
default:
return -ENODEV;
}
@@ -615,6 +1192,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
{
if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+ if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
+ ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
}
#endif
@@ -637,11 +1216,35 @@ static int ssb_prctl_get(struct task_struct *task)
}
}
+static int ib_prctl_get(struct task_struct *task)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ return PR_SPEC_NOT_AFFECTED;
+
+ switch (spectre_v2_user) {
+ case SPECTRE_V2_USER_NONE:
+ return PR_SPEC_ENABLE;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ if (task_spec_ib_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ib_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ case SPECTRE_V2_USER_STRICT:
+ return PR_SPEC_DISABLE;
+ default:
+ return PR_SPEC_NOT_AFFECTED;
+ }
+}
+
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssb_prctl_get(task);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_get(task);
default:
return -ENODEV;
}
@@ -656,6 +1259,9 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt
@@ -713,6 +1319,11 @@ static void __init l1tf_select_mitigation(void)
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return;
+ if (cpu_mitigations_off())
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (cpu_mitigations_auto_nosmt())
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+
override_cache_bits(&boot_cpu_data);
switch (l1tf_mitigation) {
@@ -735,12 +1346,13 @@ static void __init l1tf_select_mitigation(void)
#endif
half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
- if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
+ if (l1tf_mitigation != L1TF_MITIGATION_OFF &&
+ e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
half_pa);
pr_info("However, doing so will make a part of your RAM unusable.\n");
- pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n");
+ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
return;
}
@@ -773,13 +1385,14 @@ static int __init l1tf_cmdline(char *str)
early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) fmt
#ifdef CONFIG_SYSFS
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
#if IS_ENABLED(CONFIG_KVM_INTEL)
-static const char *l1tf_vmx_states[] = {
+static const char * const l1tf_vmx_states[] = {
[VMENTER_L1D_FLUSH_AUTO] = "auto",
[VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
[VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
@@ -795,21 +1408,99 @@ static ssize_t l1tf_show_state(char *buf)
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
(l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
- cpu_smt_control == CPU_SMT_ENABLED))
+ sched_smt_active())) {
return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
l1tf_vmx_states[l1tf_vmx_mitigation]);
+ }
return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
l1tf_vmx_states[l1tf_vmx_mitigation],
- cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (itlb_multihit_kvm_mitigation)
+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sprintf(buf, "KVM: Vulnerable\n");
}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sprintf(buf, "Processor vulnerable\n");
+}
#endif
+static ssize_t mds_show_state(char *buf)
+{
+#ifdef CONFIG_HYPERVISOR_GUEST
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sprintf(buf, "%s; SMT Host state unknown\n",
+ mds_strings[mds_mitigation]);
+ }
+#endif
+
+ if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
+ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+ sched_smt_active() ? "mitigated" : "disabled"));
+ }
+
+ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sprintf(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
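+/* Build the STIBP part of the sysfs spectre_v2 reporting string. */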
+static char *stibp_state(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ return "";
+
+ switch (spectre_v2_user) {
+ case SPECTRE_V2_USER_NONE:
+ return ", STIBP: disabled";
+ case SPECTRE_V2_USER_STRICT:
+ return ", STIBP: forced";
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ if (static_key_enabled(&switch_to_cond_stibp))
+ return ", STIBP: conditional";
+ }
+ return "";
+}
+
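+/* Build the IBPB part of the sysfs spectre_v2 reporting string. */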
+static char *ibpb_state(void)
+{
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ if (static_key_enabled(&switch_mm_always_ibpb))
+ return ", IBPB: always-on";
+ if (static_key_enabled(&switch_mm_cond_ibpb))
+ return ", IBPB: conditional";
+ return ", IBPB: disabled";
+ }
+ return "";
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -824,12 +1515,14 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
break;
case X86_BUG_SPECTRE_V1:
- return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+ return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
case X86_BUG_SPECTRE_V2:
- return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
- boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
+ return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
spectre_v2_module_string());
case X86_BUG_SPEC_STORE_BYPASS:
@@ -839,6 +1532,16 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
return l1tf_show_state(buf);
break;
+
+ case X86_BUG_MDS:
+ return mds_show_state(buf);
+
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
default:
break;
}
@@ -870,4 +1573,19 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b
{
return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
}
+
+ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
+}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3c01610c5ba9..f490a4fab2f7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -388,7 +388,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
* cpuid bit to be set. We need to ensure that we
* update that bit in this CPU's "cpu_info".
*/
- get_cpu_cap(c);
+ set_cpu_cap(c, X86_FEATURE_OSPKE);
}
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -752,6 +752,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_STIBP);
set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
}
+
+ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
+ }
}
void get_cpu_cap(struct cpuinfo_x86 *c)
@@ -885,84 +891,134 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
c->x86_cache_bits = c->x86_phys_bits;
}
-static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
- { X86_VENDOR_CENTAUR, 5 },
- { X86_VENDOR_INTEL, 5 },
- { X86_VENDOR_NSC, 5 },
- { X86_VENDOR_ANY, 4 },
- {}
-};
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
-static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
- { X86_VENDOR_AMD },
- {}
-};
+#define VULNWL(_vendor, _family, _model, _whitelist) \
+ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
-static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
- { X86_VENDOR_CENTAUR, 5, },
- { X86_VENDOR_INTEL, 5, },
- { X86_VENDOR_NSC, 5, },
- { X86_VENDOR_AMD, 0x12, },
- { X86_VENDOR_AMD, 0x11, },
- { X86_VENDOR_AMD, 0x10, },
- { X86_VENDOR_AMD, 0xf, },
- { X86_VENDOR_ANY, 4, },
- {}
-};
+#define VULNWL_INTEL(model, whitelist) \
+ VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+
+#define VULNWL_AMD(family, whitelist) \
+ VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
+
+static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
+ VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
+
+ /* Intel Family 6 */
+ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(CORE_YONAH, NO_SSB),
+
+ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ /*
+ * Technically, swapgs isn't serializing on AMD (despite it previously
+ * being documented as such in the APM). But according to AMD, %gs is
+ * updated non-speculatively, and the issuing of %gs-relative memory
+ * operands will be blocked until the %gs update completes, which is
+ * good enough for our purposes.
+ */
+
+ /* AMD Family 0xf - 0x12 */
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
-static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
- /* in addition to cpu_no_speculation */
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
+ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
{}
};
-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
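+/* Check whether the boot CPU's whitelist entry carries the given NO_* flag. */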
+static bool __init cpu_matches(unsigned long which)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
+
+ return m && !!(m->driver_data & which);
+}
+
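+/* Read MSR_IA32_ARCH_CAPABILITIES if the CPU enumerates it; return 0 otherwise. */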
+u64 x86_read_arch_cap_msr(void)
{
u64 ia32_cap = 0;
- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
- if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
- !(ia32_cap & ARCH_CAP_SSB_NO))
- setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+ return ia32_cap;
+}
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+	/* Set the ITLB_MULTIHIT bug if the CPU is not in the whitelist and not mitigated */
+ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
- if (x86_match_cpu(cpu_no_speculation))
+ if (cpu_matches(NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
if (ia32_cap & ARCH_CAP_IBRS_ALL)
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
- if (x86_match_cpu(cpu_no_meltdown))
+ if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ setup_force_cpu_bug(X86_BUG_MDS);
+ if (cpu_matches(MSBDS_ONLY))
+ setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
+ }
+
+ if (!cpu_matches(NO_SWAPGS))
+ setup_force_cpu_bug(X86_BUG_SWAPGS);
+
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * TSX_CTRL check is needed for cases when TSX could be disabled before
+	 * the kernel boots, e.g. via kexec.
+	 * The TSX_CTRL check alone is not sufficient for cases when the microcode
+	 * update is not present or when running as a guest that is not given TSX_CTRL.
+ */
+ if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
+ if (cpu_matches(NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
@@ -971,7 +1027,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
- if (x86_match_cpu(cpu_no_l1tf))
+ if (cpu_matches(NO_L1TF))
return;
setup_force_cpu_bug(X86_BUG_L1TF);
@@ -1380,6 +1436,8 @@ void __init identify_boot_cpu(void)
enable_sep_cpu();
#endif
cpu_detect_tlb(&boot_cpu_data);
+
+ tsx_init();
}
void identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 2275900d4d1b..4350f50b5deb 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -44,6 +44,22 @@ struct _tlb_table {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+#ifdef CONFIG_CPU_SUP_INTEL
+enum tsx_ctrl_states {
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
+
+extern void __init tsx_init(void);
+extern void tsx_enable(void);
+extern void tsx_disable(void);
+#else
+static inline void tsx_init(void) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
@@ -51,4 +67,6 @@ extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void x86_spec_ctrl_setup_ap(void);
+extern u64 x86_read_arch_cap_msr(void);
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index d39cfb2c6b63..a4f6e0ec4ba0 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -121,7 +121,7 @@ static void set_cx86_reorder(void)
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* Load/Store Serialize to mem access disable (=reorder it) */
- setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
+ setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
/* set load/store serialize from 1GB to 4GB */
ccr3 |= 0xe0;
setCx86(CX86_CCR3, ccr3);
@@ -132,11 +132,11 @@ static void set_cx86_memwb(void)
pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
}
/*
@@ -150,14 +150,14 @@ static void geode_configure(void)
local_irq_save(flags);
/* Suspend on halt power saving and enable #SUSP pin */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* FPU fast, DTE cache, Mem bypass */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
@@ -293,7 +293,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -316,7 +316,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
} else {
/* A 6x86MX - it has the bug. */
set_cpu_bug(c, X86_BUG_COMA);
@@ -434,7 +434,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c)
/* enable MAPEN */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
/* enable cpuid */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);
/* disable MAPEN */
setCx86(CX86_CCR3, ccr3);
local_irq_restore(flags);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cee0fec0d232..476a9d5c2f35 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -14,6 +14,7 @@
#include <asm/bugs.h>
#include <asm/cpu.h>
#include <asm/intel-family.h>
+#include <asm/microcode_intel.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -137,14 +138,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
- unsigned lower_word;
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- /* Required by the SDM */
- sync_core();
- rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
- }
+ if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
+ c->microcode = intel_get_microcode_revision();
/* Now if any of them are set, check the blacklist and clear the lot */
if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
@@ -647,6 +642,11 @@ static void init_intel(struct cpuinfo_x86 *c)
detect_vmx_virtcap(c);
init_intel_energy_perf(c);
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 3e0199ee5a2f..0372913e0134 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -148,6 +148,11 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
),
+ MCESEV(
+ PANIC, "Instruction fetch error in kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ KERNEL
+ ),
#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 25310d2b8609..07188a012492 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -139,6 +139,8 @@ void mce_setup(struct mce *m)
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+
+ m->microcode = boot_cpu_data.microcode;
}
DEFINE_PER_CPU(struct mce, injectm);
@@ -309,7 +311,7 @@ static void print_mce(struct mce *m)
*/
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
- cpu_data(m->extcpu).microcode);
+ m->microcode);
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
@@ -671,20 +673,50 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
barrier();
m.status = mce_rdmsrl(msr_ops.status(i));
+
+ /* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
continue;
/*
- * Uncorrected or signalled events are handled by the exception
- * handler when it is enabled, so don't process those here.
- *
- * TBD do the same check for MCI_STATUS_EN here?
+ * If we are logging everything (at CPU online) or this
+ * is a corrected error, then we must log it.
*/
- if (!(flags & MCP_UC) &&
- (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
- continue;
+ if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
+ goto log_it;
+
+ /*
+ * Newer Intel systems that support software error
+ * recovery need to make additional checks. Other
+ * CPUs should skip over uncorrected errors, but log
+ * everything else.
+ */
+ if (!mca_cfg.ser) {
+ if (m.status & MCI_STATUS_UC)
+ continue;
+ goto log_it;
+ }
+
+ /* Log "not enabled" (speculative) errors */
+ if (!(m.status & MCI_STATUS_EN))
+ goto log_it;
+ /*
+ * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
+ * UC == 1 && PCC == 0 && S == 0
+ */
+ if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
+ goto log_it;
+
+ /*
+ * Skip anything else. Presumption is that our read of this
+ * bank is racing with a machine check. Leave the log alone
+ * for do_machine_check() to deal with it.
+ */
+ continue;
+
+log_it:
error_seen = true;
mce_read_aux(&m, i);
@@ -750,8 +782,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
if (quirk_no_way_out)
quirk_no_way_out(i, m, regs);
+ m->bank = i;
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
- m->bank = i;
mce_read_aux(m, i);
*msg = tmp;
return 1;
@@ -1616,36 +1648,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
- /*
- * Turn off MC4_MISC thresholding banks on those models since
- * they're not supported there.
- */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
- int i;
- u64 hwcr;
- bool need_toggle;
- u32 msrs[] = {
- 0x00000413, /* MC4_MISC0 */
- 0xc0000408, /* MC4_MISC1 */
- };
-
- rdmsrl(MSR_K7_HWCR, hwcr);
-
- /* McStatusWrEn has to be set */
- need_toggle = !(hwcr & BIT(18));
-
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
-
- /* Clear CntP bit safely */
- for (i = 0; i < ARRAY_SIZE(msrs); i++)
- msr_clear_bit(msrs[i], 62);
-
- /* restore old settings */
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
- }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39526e1e3132..775d5f028fe8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -499,6 +499,40 @@ out:
return offset;
}
+/*
+ * Turn off MC4_MISC thresholding banks on all family 0x15 models since
+ * they're not supported there.
+ */
+void disable_err_thresholding(struct cpuinfo_x86 *c)
+{
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ if (c->x86 != 0x15)
+ return;
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -506,6 +540,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;
+ disable_err_thresholding(c);
+
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
get_smca_bank_info(bank);
@@ -810,9 +846,12 @@ static const struct sysfs_ops threshold_ops = {
.store = store,
};
+static void threshold_block_release(struct kobject *kobj);
+
static struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
.default_attrs = default_attrs,
+ .release = threshold_block_release,
};
static const char *get_name(unsigned int bank, struct threshold_block *b)
@@ -843,8 +882,9 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return buf_mcatype;
}
-static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
- unsigned int block, u32 address)
+static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
+ unsigned int bank, unsigned int block,
+ u32 address)
{
struct threshold_block *b = NULL;
u32 low, high;
@@ -888,16 +928,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
INIT_LIST_HEAD(&b->miscj);
- if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
- list_add(&b->miscj,
- &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
- } else {
- per_cpu(threshold_banks, cpu)[bank]->blocks = b;
- }
+ if (tb->blocks)
+ list_add(&b->miscj, &tb->blocks->miscj);
+ else
+ tb->blocks = b;
- err = kobject_init_and_add(&b->kobj, &threshold_ktype,
- per_cpu(threshold_banks, cpu)[bank]->kobj,
- get_name(bank, b));
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
if (err)
goto out_free;
recurse:
@@ -905,7 +941,7 @@ recurse:
if (!address)
return 0;
- err = allocate_threshold_blocks(cpu, bank, block, address);
+ err = allocate_threshold_blocks(cpu, tb, bank, block, address);
if (err)
goto out_free;
@@ -990,8 +1026,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out_free;
}
- per_cpu(threshold_banks, cpu)[bank] = b;
-
if (is_shared_bank(bank)) {
atomic_set(&b->cpus, 1);
@@ -1002,9 +1036,13 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
}
}
- err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
- if (!err)
- goto out;
+ err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
+ if (err)
+ goto out_free;
+
+ per_cpu(threshold_banks, cpu)[bank] = b;
+
+ return 0;
out_free:
kfree(b);
@@ -1038,8 +1076,12 @@ static int threshold_create_device(unsigned int cpu)
return err;
}
-static void deallocate_threshold_block(unsigned int cpu,
- unsigned int bank)
+static void threshold_block_release(struct kobject *kobj)
+{
+ kfree(to_block(kobj));
+}
+
+static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
@@ -1049,13 +1091,11 @@ static void deallocate_threshold_block(unsigned int cpu,
return;
list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
- kobject_put(&pos->kobj);
list_del(&pos->miscj);
- kfree(pos);
+ kobject_put(&pos->kobj);
}
- kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
- per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+ kobject_put(&head->blocks->kobj);
}
static void __threshold_remove_blocks(struct threshold_bank *b)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index c460c91d0c8f..be2439592b0e 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level)
/* if we just entered the thermal event */
if (new_event) {
if (event == THERMAL_THROTTLING_EVENT)
- pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ pr_warn("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 732bb03fcf91..a19fddfb6bf8 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -707,22 +707,26 @@ int apply_microcode_amd(int cpu)
return -1;
/* need to apply patch? */
- if (rev >= mc_amd->hdr.patch_id) {
- c->microcode = rev;
- uci->cpu_sig.rev = rev;
- return 0;
- }
+ if (rev >= mc_amd->hdr.patch_id)
+ goto out;
if (__apply_microcode_amd(mc_amd)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
return -1;
}
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
- mc_amd->hdr.patch_id);
- uci->cpu_sig.rev = mc_amd->hdr.patch_id;
- c->microcode = mc_amd->hdr.patch_id;
+ rev = mc_amd->hdr.patch_id;
+
+ pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
+
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
return 0;
}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 79291d6fb301..1308abfc4758 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -386,15 +386,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
}
- native_wrmsrl(MSR_IA32_UCODE_REV, 0);
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- csig.rev = val[1];
+ csig.rev = intel_get_microcode_revision();
uci->cpu_sig = csig;
uci->valid = 1;
@@ -618,29 +611,35 @@ static inline void print_ucode(struct ucode_cpu_info *uci)
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
struct microcode_intel *mc;
- unsigned int val[2];
+ u32 rev;
mc = uci->mc;
if (!mc)
return 0;
+ /*
+ * Save us the MSR write below - which is a particularly expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ rev = intel_get_microcode_revision();
+ if (rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = rev;
+ return 0;
+ }
+
/* write microcode via MSR 0x79 */
native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
- native_wrmsrl(MSR_IA32_UCODE_REV, 0);
-
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- if (val[1] != mc->hdr.rev)
+ rev = intel_get_microcode_revision();
+ if (rev != mc->hdr.rev)
return -1;
#ifdef CONFIG_X86_64
/* Flush global TLB. This is a precaution. */
flush_tlb_early();
#endif
- uci->cpu_sig.rev = val[1];
+ uci->cpu_sig.rev = rev;
if (early)
print_ucode(uci);
@@ -903,9 +902,9 @@ static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc;
struct ucode_cpu_info *uci;
- struct cpuinfo_x86 *c;
- unsigned int val[2];
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
static int prev_rev;
+ u32 rev;
/* We should bind the task to the CPU */
if (WARN_ON(raw_smp_processor_id() != cpu))
@@ -924,35 +923,42 @@ static int apply_microcode_intel(int cpu)
if (!get_matching_mc(mc, cpu))
return 0;
+ /*
+ * Save us the MSR write below - which is a particularly expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ rev = intel_get_microcode_revision();
+ if (rev >= mc->hdr.rev)
+ goto out;
+
/* write microcode via MSR 0x79 */
wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
- wrmsrl(MSR_IA32_UCODE_REV, 0);
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
+ rev = intel_get_microcode_revision();
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- if (val[1] != mc->hdr.rev) {
+ if (rev != mc->hdr.rev) {
pr_err("CPU%d update to revision 0x%x failed\n",
cpu, mc->hdr.rev);
return -1;
}
- if (val[1] != prev_rev) {
+ if (rev != prev_rev) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
- val[1],
+ rev,
mc->hdr.date & 0xffff,
mc->hdr.date >> 24,
(mc->hdr.date >> 16) & 0xff);
- prev_rev = val[1];
+ prev_rev = rev;
}
- c = &cpu_data(cpu);
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
- uci->cpu_sig.rev = val[1];
- c->microcode = val[1];
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
return 0;
}
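
The microcode hunks above swap the open-coded MSR dance for intel_get_microcode_revision(). Reconstructed from the removed lines, the helper presumably amounts to the sketch below; the in-tree version in asm/microcode_intel.h may differ in detail.

static inline u32 intel_get_microcode_revision(void)
{
	u32 dummy, rev;

	native_wrmsrl(MSR_IA32_UCODE_REV, 0);

	/* As documented in the SDM: Do a CPUID 1 here */
	sync_core();

	/* Get the current revision from MSR 0x8B */
	native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);

	return rev;
}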
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 6988c74409a8..711b74e0e623 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -3,6 +3,8 @@
# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
#
+set -e
+
IN=$1
OUT=$2
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..032509adf9de
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+
+#include "cpu.h"
+
+enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+
+void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * do not waste resources trying TSX transactions that
+ * will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static bool __init tsx_ctrl_is_supported(void)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+void __init tsx_init(void)
+{
+ char arg[5] = {};
+ int ret;
+
+ if (!tsx_ctrl_is_supported())
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
+ if (ret >= 0) {
+ if (!strcmp(arg, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(arg, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(arg, "auto")) {
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("tsx: invalid option, defaulting to off\n");
+ }
+ } else {
+ /* tsx= not provided */
+ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ else
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the RTM and HLE CPUID
+ * bits. Clear them here since they are now expected to be not
+ * set.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+ * HW defaults TSX to be enabled at bootup.
+ * We may still need the TSX enable support
+ * during init for special cases like
+ * kexec after TSX is disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the RTM and HLE CPUID
+ * bits. Force them here since they are now expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ setup_force_cpu_cap(X86_FEATURE_HLE);
+ }
+}
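
tsx_ctrl_is_supported() keys everything off ARCH_CAP_TSX_CTRL_MSR. As a hedged sketch, x86_read_arch_cap_msr() is assumed to read MSR_IA32_ARCH_CAPABILITIES only when CPUID enumerates it, and to return 0 otherwise:

u64 x86_read_arch_cap_msr(void)
{
	u64 ia32_cap = 0;

	/* The MSR exists only when CPUID enumerates ARCH_CAPABILITIES. */
	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);

	return ia32_cap;
}

With that in place, tsx=on, tsx=off and tsx=auto on the kernel command line select the three states handled in tsx_init() above.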
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 756634f14df6..775c23d4021a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -914,6 +914,8 @@ int __init hpet_enable(void)
return 0;
hpet_set_mapping();
+ if (!hpet_virt_address)
+ return 0;
/*
* Read the period and check for a sane value:
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 8771766d46b6..9954a604a822 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -352,6 +352,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
#endif
default:
WARN_ON_ONCE(1);
+ return -EINVAL;
}
/*
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index bcd1b82c86e8..005e9a77a664 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -25,9 +25,18 @@ int sysctl_panic_on_stackoverflow;
/*
* Probabilistic stack overflow check:
*
- * Only check the stack in process context, because everything else
- * runs on the big interrupt stacks. Checking reliably is too expensive,
- * so we just check from interrupts.
+ * Regular device interrupts can enter on the following stacks:
+ *
+ * - User stack
+ *
+ * - Kernel task stack
+ *
+ * - Interrupt stack if a device driver reenables interrupts
+ * which should only happen in really old drivers.
+ *
+ * - Debug IST stack
+ *
+ * All other contexts are invalid.
*/
static inline void stack_overflow_check(struct pt_regs *regs)
{
@@ -52,8 +61,8 @@ static inline void stack_overflow_check(struct pt_regs *regs)
return;
oist = this_cpu_ptr(&orig_ist);
- estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
- estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
+ estack_bottom = (u64)oist->ist[DEBUG_STACK];
+ estack_top = estack_bottom - DEBUG_STKSZ + STACK_TOP_MARGIN;
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8e36f249646e..904e18bb38c5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -438,7 +438,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs)
*/
void kgdb_roundup_cpus(unsigned long flags)
{
- apic->send_IPI_allbutself(APIC_DM_NMI);
+ apic->send_IPI_allbutself(NMI_VECTOR);
}
#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 64a70b2e2285..dcd6df5943d6 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -376,6 +376,10 @@ int __copy_instruction(u8 *dest, u8 *src)
return 0;
memcpy(dest, insn.kaddr, length);
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(&insn))
+ return 0;
+
#ifdef CONFIG_X86_64
if (insn_rip_relative(&insn)) {
s64 newdisp;
@@ -545,6 +549,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
unsigned long *sara = stack_addr(regs);
ri->ret_addr = (kprobe_opcode_t *) *sara;
+ ri->fp = sara;
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
@@ -746,15 +751,21 @@ __visible __used void *trampoline_handler(struct pt_regs *regs)
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
kprobe_opcode_t *correct_ret_addr = NULL;
+ void *frame_pointer;
+ bool skipped = false;
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
/* fixup registers */
#ifdef CONFIG_X86_64
regs->cs = __KERNEL_CS;
+ /* On x86-64, we use pt_regs->sp for return address holder. */
+ frame_pointer = &regs->sp;
#else
regs->cs = __KERNEL_CS | get_kernel_rpl();
regs->gs = 0;
+ /* On x86-32, we use pt_regs->flags for return address holder. */
+ frame_pointer = &regs->flags;
#endif
regs->ip = trampoline_address;
regs->orig_ax = ~0UL;
@@ -776,8 +787,25 @@ __visible __used void *trampoline_handler(struct pt_regs *regs)
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
+ /*
+ * Return probes must be pushed on this hash list in the correct
+ * order (same as return order) so that they can be popped
+ * correctly. However, if we find one pushed in the wrong order,
+ * we have hit a function which should not be probed, because the
+ * out-of-order entry was pushed while another kretprobe was
+ * being processed.
+ */
+ if (ri->fp != frame_pointer) {
+ if (!skipped)
+ pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n");
+ skipped = true;
+ continue;
+ }
orig_ret_address = (unsigned long)ri->ret_addr;
+ if (skipped)
+ pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n",
+ ri->rp->kp.addr);
if (orig_ret_address != trampoline_address)
/*
@@ -795,6 +823,8 @@ __visible __used void *trampoline_handler(struct pt_regs *regs)
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
+ if (ri->fp != frame_pointer)
+ continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bfe4d6c96fbd..6b7b35d80264 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -32,6 +32,7 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/cache.h>
+#include <asm/nospec-branch.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -544,6 +545,9 @@ nmi_restart:
write_cr2(this_cpu_read(nmi_cr2));
if (this_cpu_dec_return(nmi_state))
goto nmi_restart;
+
+ if (user_mode(regs))
+ mds_user_clear_cpu_buffers();
}
NOKPROBE_SYMBOL(do_nmi);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e9195a139d4e..f1f3c471438f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -35,6 +35,8 @@
#include <asm/switch_to.h>
#include <asm/spec-ctrl.h>
+#include "process.h"
+
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's. The TSS size is kept cacheline-aligned
@@ -170,11 +172,12 @@ int set_tsc_mode(unsigned int val)
return 0;
}
-static inline void switch_to_bitmap(struct tss_struct *tss,
- struct thread_struct *prev,
+static inline void switch_to_bitmap(struct thread_struct *prev,
struct thread_struct *next,
unsigned long tifp, unsigned long tifn)
{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss);
+
if (tifn & _TIF_IO_BITMAP) {
/*
* Copy the relevant range of the IO bitmap.
@@ -308,32 +311,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}
-static __always_inline void intel_set_ssb_state(unsigned long tifn)
+/*
+ * Update the MSRs managing speculation control, during context switch.
+ *
+ * tifp: Previous task's thread flags
+ * tifn: Next task's thread flags
+ */
+static __always_inline void __speculation_ctrl_update(unsigned long tifp,
+ unsigned long tifn)
{
- u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
+ unsigned long tif_diff = tifp ^ tifn;
+ u64 msr = x86_spec_ctrl_base;
+ bool updmsr = false;
+
+ /*
+ * If TIF_SSBD is different, select the proper mitigation
+ * method. Note that if SSBD mitigation is disabled or permanently
+ * enabled this branch can't be taken because nothing can set
+ * TIF_SSBD.
+ */
+ if (tif_diff & _TIF_SSBD) {
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ amd_set_ssb_virt_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ amd_set_core_ssb_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
+ updmsr = true;
+ }
+ }
+
+ /*
+ * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
+ * otherwise avoid the MSR write.
+ */
+ if (IS_ENABLED(CONFIG_SMP) &&
+ static_branch_unlikely(&switch_to_cond_stibp)) {
+ updmsr |= !!(tif_diff & _TIF_SPEC_IB);
+ msr |= stibp_tif_to_spec_ctrl(tifn);
+ }
- wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+ if (updmsr)
+ wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}
-static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
+static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
- if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
- amd_set_ssb_virt_state(tifn);
- else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
- amd_set_core_ssb_state(tifn);
- else
- intel_set_ssb_state(tifn);
+ if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
+ if (task_spec_ssb_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SSBD);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SSBD);
+
+ if (task_spec_ib_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ }
+ /* Return the updated threadinfo flags */
+ return task_thread_info(tsk)->flags;
}
-void speculative_store_bypass_update(unsigned long tif)
+void speculation_ctrl_update(unsigned long tif)
{
+ /* Forced update. Make sure all relevant TIF flags are different */
preempt_disable();
- __speculative_store_bypass_update(tif);
+ __speculation_ctrl_update(~tif, tif);
preempt_enable();
}
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss)
+/* Called from seccomp/prctl update */
+void speculation_ctrl_update_current(void)
+{
+ preempt_disable();
+ speculation_ctrl_update(speculation_ctrl_update_tif(current));
+ preempt_enable();
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev, *next;
unsigned long tifp, tifn;
@@ -343,7 +399,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
tifn = READ_ONCE(task_thread_info(next_p)->flags);
tifp = READ_ONCE(task_thread_info(prev_p)->flags);
- switch_to_bitmap(tss, prev, next, tifp, tifn);
+ switch_to_bitmap(prev, next, tifp, tifn);
propagate_user_return_notify(prev_p, next_p);
@@ -361,8 +417,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
if ((tifp ^ tifn) & _TIF_NOTSC)
cr4_toggle_bits(X86_CR4_TSD);
- if ((tifp ^ tifn) & _TIF_SSBD)
- __speculative_store_bypass_update(tifn);
+ if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
+ __speculation_ctrl_update(tifp, tifn);
+ } else {
+ speculation_ctrl_update_tif(prev_p);
+ tifn = speculation_ctrl_update_tif(next_p);
+
+ /* Enforce MSR update to ensure consistent state */
+ __speculation_ctrl_update(~tifn, tifn);
+ }
}
/*
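
__speculation_ctrl_update() builds the SPEC_CTRL value from per-task TIF bits via ssbd_tif_to_spec_ctrl() and stibp_tif_to_spec_ctrl(). A rough sketch of those helpers (the real ones live in asm/spec-ctrl.h and carry extra BUILD_BUG_ON checks) just shifts each TIF bit into its MSR bit position:

static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
{
	/* Move the TIF_SSBD bit down to the SPEC_CTRL_SSBD position. */
	return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}

static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
{
	/* Move the TIF_SPEC_IB bit down to the SPEC_CTRL_STIBP position. */
	return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
}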
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
new file mode 100644
index 000000000000..898e97cf6629
--- /dev/null
+++ b/arch/x86/kernel/process.h
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Code shared between 32 and 64 bit
+
+#include <asm/spec-ctrl.h>
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
+
+/*
+ * This needs to be inline to optimize for the common case where no extra
+ * work needs to be done.
+ */
+static inline void switch_to_extra(struct task_struct *prev,
+ struct task_struct *next)
+{
+ unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long prev_tif = task_thread_info(prev)->flags;
+
+ if (IS_ENABLED(CONFIG_SMP)) {
+ /*
+ * Avoid __switch_to_xtra() invocation when conditional
+ * STIBP is disabled and the only different bit is
+ * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
+ * in the TIF_WORK_CTXSW masks.
+ */
+ if (!static_branch_likely(&switch_to_cond_stibp)) {
+ prev_tif &= ~_TIF_SPEC_IB;
+ next_tif &= ~_TIF_SPEC_IB;
+ }
+ }
+
+ /*
+ * __switch_to_xtra() handles debug registers, i/o bitmaps,
+ * speculation mitigations etc.
+ */
+ if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
+ prev_tif & _TIF_WORK_CTXSW_PREV))
+ __switch_to_xtra(prev, next);
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd7be8efdc4c..4ca26fc7aa89 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,6 +55,8 @@
#include <asm/switch_to.h>
#include <asm/vm86.h>
+#include "process.h"
+
void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -127,6 +129,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
struct task_struct *tsk;
int err;
+ /*
+ * For a new task use the RESET flags value since there is no before.
+ * All the status flags are zero; DF and all the system flags must also
+ * be 0, specifically IF must be 0 because we context switch to the new
+ * task with interrupts disabled.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
frame->bp = 0;
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
@@ -264,12 +273,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
set_iopl_mask(next->iopl);
- /*
- * Now maybe handle debug registers and/or IO bitmaps
- */
- if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
- task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
- __switch_to_xtra(prev_p, next_p, tss);
+ switch_to_extra(prev_p, next_p);
/*
* Leave lazy mode, flushing any hypercalls made here.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a2661814bde0..6d6c15cd9b9a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,8 @@
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
+#include "process.h"
+
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
/* Prints also some state that isn't saved in the pt_regs */
@@ -266,6 +268,14 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
+
+ /*
+ * For a new task use the RESET flags value since there is no before.
+ * All the status flags are zero; DF and all the system flags must also
+ * be 0, specifically IF must be 0 because we context switch to the new
+ * task with interrupts disabled.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
frame->bp = 0;
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
@@ -454,12 +464,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Reload esp0 and ss1. This changes current_thread_info(). */
load_sp0(tss, next);
- /*
- * Now maybe reload the debug registers and handle I/O bitmaps
- */
- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
- task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
- __switch_to_xtra(prev_p, next_p, tss);
+ switch_to_extra(prev_p, next_p);
#ifdef CONFIG_XEN
/*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e497d374412a..7f377f8792aa 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -23,6 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
+#include <linux/nospec.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -652,7 +653,8 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
unsigned long val = 0;
if (n < HBP_NUM) {
- struct perf_event *bp = thread->ptrace_bps[n];
+ int index = array_index_nospec(n, HBP_NUM);
+ struct perf_event *bp = thread->ptrace_bps[index];
if (bp)
val = bp->hw.info.address;
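
This hunk, and the later ones in tls.c, emulate.c, hyperv.c, ioapic.c and lapic.c, all lean on array_index_nospec() from linux/nospec.h. It behaves like index < size ? index : 0, but computes the clamp with a mask instead of a branch, so a mispredicted bounds check cannot feed an out-of-range index to a speculative load. The generic fallback looks roughly like this (the in-tree version adds BUILD_BUG_ON size checks):

/* All-ones when 0 <= index < size, zero otherwise - no conditional branch. */
static inline unsigned long array_index_mask_nospec(unsigned long index,
						    unsigned long size)
{
	return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
}

#define array_index_nospec(index, size)					\
({									\
	typeof(index) _i = (index);					\
	typeof(size) _s = (size);					\
	unsigned long _mask = array_index_mask_nospec(_i, _s);		\
									\
	(typeof(_i)) (_i & _mask);					\
})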
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 4a12362a194a..c55b11fe8e9f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -82,6 +82,19 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
return 0;
}
+/*
+ * Some machines don't handle the default ACPI reboot method and
+ * require the EFI reboot method:
+ */
+static int __init set_efi_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) {
+ reboot_type = BOOT_EFI;
+ pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident);
+ }
+ return 0;
+}
+
void __noreturn machine_real_restart(unsigned int type)
{
local_irq_disable();
@@ -167,6 +180,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
},
},
+ { /* Handle reboot issue on Acer TravelMate X514-51T */
+ .callback = set_efi_reboot,
+ .ident = "Acer TravelMate X514-51T",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"),
+ },
+ },
/* Apple */
{ /* Handle problems with rebooting on Apple MacBook5 */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b1a5d252d482..ca010dfb9682 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -129,16 +129,6 @@ static int restore_sigcontext(struct pt_regs *regs,
COPY_SEG_CPL3(cs);
COPY_SEG_CPL3(ss);
-#ifdef CONFIG_X86_64
- /*
- * Fix up SS if needed for the benefit of old DOSEMU and
- * CRIU.
- */
- if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) &&
- user_64bit_mode(regs)))
- force_valid_ss(regs);
-#endif
-
get_user_ex(tmpflags, &sc->flags);
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
regs->orig_ax = -1; /* disable syscall checks */
@@ -147,6 +137,15 @@ static int restore_sigcontext(struct pt_regs *regs,
buf = (void __user *)buf_val;
} get_user_catch(err);
+#ifdef CONFIG_X86_64
+ /*
+ * Fix up SS if needed for the benefit of old DOSEMU and
+ * CRIU.
+ */
+ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
+ force_valid_ss(regs);
+#endif
+
err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
force_iret();
@@ -458,6 +457,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
{
struct rt_sigframe __user *frame;
void __user *fp = NULL;
+ unsigned long uc_flags;
int err = 0;
frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
@@ -470,9 +470,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
return -EFAULT;
}
+ uc_flags = frame_uc_flags(regs);
+
put_user_try {
/* Create the ucontext. */
- put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
+ put_user_ex(uc_flags, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
save_altstack_ex(&frame->uc.uc_stack, regs->sp);
@@ -538,6 +540,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
{
#ifdef CONFIG_X86_X32_ABI
struct rt_sigframe_x32 __user *frame;
+ unsigned long uc_flags;
void __user *restorer;
int err = 0;
void __user *fpstate = NULL;
@@ -552,9 +555,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
return -EFAULT;
}
+ uc_flags = frame_uc_flags(regs);
+
put_user_try {
/* Create the ucontext. */
- put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
+ put_user_ex(uc_flags, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
put_user_ex(0, &frame->uc.uc__pad0);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 2863ad306692..33ba47c44816 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -181,6 +181,12 @@ asmlinkage __visible void smp_reboot_interrupt(void)
irq_exit();
}
+static int register_stop_handler(void)
+{
+ return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop");
+}
+
static void native_stop_other_cpus(int wait)
{
unsigned long flags;
@@ -214,39 +220,41 @@ static void native_stop_other_cpus(int wait)
apic->send_IPI_allbutself(REBOOT_VECTOR);
/*
- * Don't wait longer than a second if the caller
- * didn't ask us to wait.
+ * Don't wait longer than a second for IPI completion. The
+ * wait request is not checked here because that would
+ * prevent an NMI shutdown attempt in case not all
+ * CPUs reach shutdown state.
*/
timeout = USEC_PER_SEC;
- while (num_online_cpus() > 1 && (wait || timeout--))
+ while (num_online_cpus() > 1 && timeout--)
udelay(1);
}
-
- /* if the REBOOT_VECTOR didn't work, try with the NMI */
- if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) {
- if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
- NMI_FLAG_FIRST, "smp_stop"))
- /* Note: we ignore failures here */
- /* Hope the REBOOT_IRQ is good enough */
- goto finish;
-
- /* sync above data before sending IRQ */
- wmb();
- pr_emerg("Shutting down cpus with NMI\n");
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if (num_online_cpus() > 1) {
+ /*
+ * If NMI IPI is enabled, try to register the stop handler
+ * and send the IPI. In any case try to wait for the other
+ * CPUs to stop.
+ */
+ if (!smp_no_nmi_ipi && !register_stop_handler()) {
+ /* Sync above data before sending IRQ */
+ wmb();
- apic->send_IPI_allbutself(NMI_VECTOR);
+ pr_emerg("Shutting down cpus with NMI\n");
+ apic->send_IPI_allbutself(NMI_VECTOR);
+ }
/*
- * Don't wait longer than a 10 ms if the caller
- * didn't ask us to wait.
+ * Don't wait longer than 10 ms if the caller didn't
+ * request it. If wait is true, the machine hangs here if
+ * one or more CPUs do not reach shutdown state.
*/
timeout = USEC_PER_MSEC * 10;
while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
-finish:
local_irq_save(flags);
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
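
For context, the smp_stop_nmi_callback() registered by register_stop_handler() is defined earlier in smp.c; roughly, it is a one-liner that parks the CPU and claims the NMI (sketch):

static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
{
	/* We are in NMI context; just halt this CPU for good. */
	stop_this_cpu(NULL);

	return NMI_HANDLED;
}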
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
index 623965e86b65..897da526e40e 100644
--- a/arch/x86/kernel/sysfb_efi.c
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -231,9 +231,55 @@ static const struct dmi_system_id efifb_dmi_system_table[] __initconst = {
{},
};
+/*
+ * Some devices have a portrait LCD but advertise a landscape resolution (and
+ * pitch). We simply swap width and height for these devices so that we can
+ * correctly deal with some of them coming with multiple resolutions.
+ */
+static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = {
+ {
+ /*
+ * Lenovo MIIX310-10ICR, only some batches have the troublesome
+ * 800x1280 portrait screen. Luckily the portrait version has
+ * its own BIOS version, so we match on that.
+ */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "MIIX 310-10ICR"),
+ DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1HCN44WW"),
+ },
+ },
+ {
+ /* Lenovo MIIX 320-10ICR with 800x1280 portrait screen */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
+ "Lenovo MIIX 320-10ICR"),
+ },
+ },
+ {
+ /* Lenovo D330 with 800x1280 or 1200x1920 portrait screen */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
+ "Lenovo ideapad D330-10IGM"),
+ },
+ },
+ {},
+};
+
__init void sysfb_apply_efi_quirks(void)
{
if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI ||
!(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS))
dmi_check_system(efifb_dmi_system_table);
+
+ if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI &&
+ dmi_check_system(efifb_dmi_swap_width_height)) {
+ u16 temp = screen_info.lfb_width;
+
+ screen_info.lfb_width = screen_info.lfb_height;
+ screen_info.lfb_height = temp;
+ screen_info.lfb_linelength = 4 * screen_info.lfb_width;
+ }
}
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
index 85195d447a92..f3215346e47f 100644
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -94,11 +94,11 @@ __init int create_simplefb(const struct screen_info *si,
if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
size <<= 16;
length = mode->height * mode->stride;
- length = PAGE_ALIGN(length);
if (length > size) {
printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
return -EINVAL;
}
+ length = PAGE_ALIGN(length);
/* setup IORESOURCE_MEM as framebuffer memory */
memset(&res, 0, sizeof(res));
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9692a5e9fdab..b95693a73f12 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -4,6 +4,7 @@
#include <linux/user.h>
#include <linux/regset.h>
#include <linux/syscalls.h>
+#include <linux/nospec.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
@@ -219,6 +220,7 @@ int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *u_info)
{
struct user_desc info;
+ int index;
if (idx == -1 && get_user(idx, &u_info->entry_number))
return -EFAULT;
@@ -226,8 +228,11 @@ int do_get_thread_area(struct task_struct *p, int idx,
if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
return -EINVAL;
- fill_user_desc(&info, idx,
- &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]);
+ index = idx - GDT_ENTRY_TLS_MIN;
+ index = array_index_nospec(index,
+ GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1);
+
+ fill_user_desc(&info, idx, &p->thread.tls_array[index]);
if (copy_to_user(u_info, &info, sizeof(info)))
return -EFAULT;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 769c370011d6..cb768417429d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void)
case INTEL_FAM6_KABYLAKE_DESKTOP:
crystal_khz = 24000; /* 24.0 MHz */
break;
- case INTEL_FAM6_ATOM_DENVERTON:
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
crystal_khz = 25000; /* 25.0 MHz */
break;
case INTEL_FAM6_ATOM_GOLDMONT:
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index a2456d4d286a..9b8b3cb2e934 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -6,6 +6,21 @@
#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x) \
+({ \
+ unsigned long val; \
+ if (task == current) \
+ val = READ_ONCE(x); \
+ else \
+ val = READ_ONCE_NOCHECK(x); \
+ val; \
+})
+
unsigned long unwind_get_return_address(struct unwind_state *state)
{
unsigned long addr;
@@ -14,7 +29,8 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
if (unwind_done(state))
return 0;
- addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
+ addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
+ addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr,
addr_p);
return __kernel_text_address(addr) ? addr : 0;
@@ -48,7 +64,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (unwind_done(state))
return false;
- next_bp = (unsigned long *)*state->bp;
+ next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp);
/* make sure the next frame's data is accessible */
if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index e78a6b1db74b..73391c1bd2a9 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -296,6 +296,10 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
if (is_prefix_bad(insn))
return -ENOTSUPP;
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return -ENOTSUPP;
+
if (x86_64)
good_insns = good_insns_64;
else
@@ -514,9 +518,12 @@ struct uprobe_xol_ops {
void (*abort)(struct arch_uprobe *, struct pt_regs *);
};
-static inline int sizeof_long(void)
+static inline int sizeof_long(struct pt_regs *regs)
{
- return in_ia32_syscall() ? 4 : 8;
+ /*
+ * Check registers for mode as in_xxx_syscall() does not apply here.
+ */
+ return user_64bit_mode(regs) ? 8 : 4;
}
static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
@@ -527,9 +534,9 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
static int push_ret_address(struct pt_regs *regs, unsigned long ip)
{
- unsigned long new_sp = regs->sp - sizeof_long();
+ unsigned long new_sp = regs->sp - sizeof_long(regs);
- if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
+ if (copy_to_user((void __user *)new_sp, &ip, sizeof_long(regs)))
return -EFAULT;
regs->sp = new_sp;
@@ -562,7 +569,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
long correction = utask->vaddr - utask->xol_vaddr;
regs->ip += correction;
} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
- regs->sp += sizeof_long(); /* Pop incorrect return address */
+ regs->sp += sizeof_long(regs); /* Pop incorrect return address */
if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
return -ERESTART;
}
@@ -671,7 +678,7 @@ static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
* "call" insn was executed out-of-line. Just restore ->sp and restart.
* We could also restore ->ip and try to call branch_emulate_op() again.
*/
- regs->sp += sizeof_long();
+ regs->sp += sizeof_long(regs);
return -ERESTART;
}
@@ -962,7 +969,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
- int rasize = sizeof_long(), nleft;
+ int rasize = sizeof_long(regs), nleft;
unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
@@ -980,7 +987,7 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
- force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
+ force_sig(SIGSEGV, current);
}
return -1;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e783a5daaab2..55f04875293f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -367,7 +367,7 @@ SECTIONS
* Per-cpu symbols which need to be offset from __per_cpu_load
* for the boot processor.
*/
-#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
+#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
INIT_PER_CPU(gdt_page);
INIT_PER_CPU(irq_stack_union);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c17d3893ae60..63c3ff9e74d4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -279,13 +279,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
{
switch (func) {
case 0:
- entry->eax = 1; /* only one leaf currently */
+ entry->eax = 7;
++*nent;
break;
case 1:
entry->ecx = F(MOVBE);
++*nent;
break;
+ case 7:
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (index == 0)
+ entry->ecx = F(RDPID);
+ ++*nent;
default:
break;
}
@@ -355,7 +360,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
- F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
+ F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
+ F(AMD_SSB_NO) | F(AMD_STIBP);
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -380,14 +386,15 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES);
+ F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) |
+ F(INTEL_STIBP) | F(MD_CLEAR);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
r = -E2BIG;
- if (*nent >= maxnent)
+ if (WARN_ON(*nent >= maxnent))
goto out;
do_cpuid_1_ent(entry, function, index);
@@ -464,8 +471,17 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
+
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
cpuid_mask(&entry->edx, CPUID_7_EDX);
+ if (boot_cpu_has(X86_FEATURE_IBPB) &&
+ boot_cpu_has(X86_FEATURE_IBRS))
+ entry->edx |= F(SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ entry->edx |= F(INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->edx |= F(SPEC_CTRL_SSBD);
/*
* We emulate ARCH_CAPABILITIES in software even
* if the host doesn't support it.
@@ -633,7 +649,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ebx |= F(VIRT_SSBD);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
- if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ /*
+ * The preference is to use SPEC CTRL MSR instead of the
+ * VIRT_SPEC MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
entry->ebx |= F(VIRT_SSBD);
break;
}
@@ -676,6 +697,9 @@ out:
static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
u32 idx, int *nent, int maxnent, unsigned int type)
{
+ if (*nent >= maxnent)
+ return -E2BIG;
+
if (type == KVM_GET_EMULATED_CPUID)
return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 8a841b9d8f84..b2bf8e1d5782 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -176,7 +176,7 @@ static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS)))
+ if (best && (best->ebx & (bit(X86_FEATURE_AMD_IBRS) | bit(X86_FEATURE_AMD_SSBD))))
return true;
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD)));
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 510cfc06701a..da3cd734dee1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -21,6 +21,7 @@
*/
#include <linux/kvm_host.h>
+#include <linux/nospec.h>
#include "kvm_cache_regs.h"
#include <asm/kvm_emulate.h>
#include <linux/stringify.h>
@@ -2579,15 +2580,13 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
* CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
* supports long mode.
*/
- cr4 = ctxt->ops->get_cr(ctxt, 4);
if (emulator_has_longmode(ctxt)) {
struct desc_struct cs_desc;
/* Zero CR4.PCIDE before CR0.PG. */
- if (cr4 & X86_CR4_PCIDE) {
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PCIDE)
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
- cr4 &= ~X86_CR4_PCIDE;
- }
/* A 32-bit code segment is required to clear EFER.LMA. */
memset(&cs_desc, 0, sizeof(cs_desc));
@@ -2601,13 +2600,16 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if (cr0 & X86_CR0_PE)
ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
- /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
- if (cr4 & X86_CR4_PAE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+ if (emulator_has_longmode(ctxt)) {
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PAE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
- /* And finally go back to 32-bit mode. */
- efer = 0;
- ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+ }
smbase = ctxt->ops->get_smbase(ctxt);
if (emulator_has_longmode(ctxt))
@@ -3529,6 +3531,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc_aux = 0;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+ return emulate_gp(ctxt, 0);
+ ctxt->dst.val = tsc_aux;
+ return X86EMUL_CONTINUE;
+}
+
static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
{
u64 tsc = 0;
@@ -4389,10 +4401,20 @@ static const struct opcode group8[] = {
F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
};
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+};
+
+
static const struct group_dual group9 = { {
N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
}, {
- N, N, N, N, N, N, N, N,
+ N, N, N, N, N, N, N,
+ GP(0, &pfx_0f_c7_7),
} };
static const struct opcode group11[] = {
@@ -5000,6 +5022,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
ctxt->fetch.ptr = ctxt->fetch.data;
ctxt->fetch.end = ctxt->fetch.data + insn_len;
ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
if (insn_len > 0)
memcpy(ctxt->fetch.data, insn, insn_len);
else {
@@ -5052,16 +5075,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
ctxt->ad_bytes = def_ad_bytes ^ 6;
break;
case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
case 0x3e: /* DS override */
has_seg_override = true;
- ctxt->seg_override = (ctxt->b >> 3) & 3;
+ ctxt->seg_override = VCPU_SREG_DS;
break;
case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
case 0x65: /* GS override */
has_seg_override = true;
- ctxt->seg_override = ctxt->b & 7;
+ ctxt->seg_override = VCPU_SREG_GS;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
@@ -5145,10 +5180,15 @@ done_prefixes:
}
break;
case Escape:
- if (ctxt->modrm > 0xbf)
- opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
- else
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
break;
case InstrDual:
if ((ctxt->modrm >> 6) == 3)
@@ -5256,6 +5296,8 @@ done_prefixes:
ctxt->memopp->addr.mem.ea + ctxt->_eip);
done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
}
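
em_rdpid() emulates RDPID by handing back MSR_TSC_AUX, i.e. the same auxiliary value RDTSCP reports. A hypothetical user-space probe (assumes the assembler knows the rdpid mnemonic and the CPU or guest actually exposes the feature):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t aux;

	/* RDPID reads IA32_TSC_AUX without RDTSCP's serializing cost. */
	asm volatile("rdpid %0" : "=r" (aux));

	/* Linux stores (node << 12) | cpu in IA32_TSC_AUX. */
	printf("cpu=%llu node=%llu\n",
	       (unsigned long long)(aux & 0xfff),
	       (unsigned long long)(aux >> 12));

	return 0;
}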
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 42b1c83741c8..5e837c96e93f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -28,6 +28,7 @@
#include <linux/kvm_host.h>
#include <linux/highmem.h>
+#include <linux/nospec.h>
#include <asm/apicdef.h>
#include <trace/events/kvm.h>
@@ -719,11 +720,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
u32 index, u64 *pdata)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- *pdata = hv->hv_crash_param[index];
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
return 0;
}
@@ -762,11 +764,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
u32 index, u64 data)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- hv->hv_crash_param[index] = data;
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
return 0;
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 5f810bb80802..aa34b16e62c2 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -36,6 +36,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/nospec.h>
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -73,13 +74,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
default:
{
u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
- u64 redir_content;
+ u64 redir_content = ~0ULL;
- if (redir_index < IOAPIC_NUM_PINS)
- redir_content =
- ioapic->redirtbl[redir_index].bits;
- else
- redir_content = ~0ULL;
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[index].bits;
+ }
result = (ioapic->ioregsel & 0x1) ?
(redir_content >> 32) & 0xffffffff :
@@ -299,6 +301,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
ioapic_debug("change redir index %x val %x\n", index, val);
if (index >= IOAPIC_NUM_PINS)
return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
e = &ioapic->redirtbl[index];
mask_before = e->fields.mask;
/* Preserve read-only fields */
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6c0191615f23..cf8b3c17657a 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -436,7 +436,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
- if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+ if (irq.trig_mode && kvm_apic_match_dest(vcpu, NULL, 0,
irq.dest_id, irq.dest_mode))
__set_bit(irq.vector, ioapic_handled_vectors);
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c8630569e392..3988e26af3b5 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -28,6 +28,7 @@
#include <linux/export.h>
#include <linux/math64.h>
#include <linux/slab.h>
+#include <linux/nospec.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/page.h>
@@ -531,9 +532,11 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
{
u8 val;
- if (pv_eoi_get_user(vcpu, &val) < 0)
+ if (pv_eoi_get_user(vcpu, &val) < 0) {
apic_debug("Can't read EOI MSR value: 0x%llx\n",
(unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return false;
+ }
return val & 0x1;
}
@@ -1587,15 +1590,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
- case APIC_LVTERR:
+ case APIC_LVTERR: {
/* TODO: Check vector */
+ size_t size;
+ u32 index;
+
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
-
- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+ size = ARRAY_SIZE(apic_lvt_mask);
+ index = array_index_nospec(
+ (reg - APIC_LVTT) >> 4, size);
+ val &= apic_lvt_mask[index];
kvm_lapic_set_reg(apic, reg, val);
-
break;
+ }
case APIC_LVTT:
if (!kvm_apic_sw_enabled(apic))
@@ -1992,7 +2000,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
- if (!apic_enabled(apic))
+ if (!kvm_apic_hw_enabled(apic))
return -1;
apic_update_ppr(apic);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 676edfc19a95..3a281a2decde 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,7 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/kthread.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -44,6 +45,30 @@
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+ .set = set_nx_huge_pages_recovery_ratio,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+
/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
@@ -131,9 +156,6 @@ module_param(dbg, bool, 0644);
#include <trace/events/kvm.h>
-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
@@ -142,6 +164,20 @@ module_param(dbg, bool, 0644);
/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+};
+
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
@@ -179,14 +215,23 @@ static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_present_mask;
static void mmu_spte_set(u64 *sptep, u64 spte);
+static bool is_executable_pte(u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
shadow_mmio_mask = mmio_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
/*
* the low bit of the generation number is always presumed to be zero.
* This disables mmio caching during memslot updates. The concept is
@@ -324,6 +369,11 @@ static int is_last_spte(u64 pte, int level)
return 0;
}
+static bool is_executable_pte(u64 spte)
+{
+ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
static kvm_pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -767,10 +817,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
- if (sp->role.direct)
- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
- else
+ if (!sp->role.direct) {
sp->gfns[index] = gfn;
+ return;
+ }
+
+ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+ pr_err_ratelimited("gfn mismatch under direct page %llx "
+ "(expected %llx, got %llx)\n",
+ sp->gfn,
+ kvm_mmu_page_get_gfn(sp, index), gfn);
}
/*
@@ -829,6 +885,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->lpage_disallowed_link,
+ &kvm->arch.lpage_disallowed_mmu_pages);
+ sp->lpage_disallowed = true;
+}
+
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
@@ -846,6 +913,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+ list_del(&sp->lpage_disallowed_link);
+}
+
static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
@@ -2382,6 +2456,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_reload_remote_mmus(kvm);
}
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
sp->role.invalid = 1;
return ret;
}
@@ -2533,6 +2610,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!speculative)
spte |= shadow_accessed_mask;
+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -2598,13 +2680,13 @@ done:
return ret;
}
-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- bool emulate = false;
+ int ret = RET_PF_RETRY;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2634,18 +2716,15 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- emulate = true;
+ ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep)))
- emulate = true;
+ ret = RET_PF_EMULATE;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
- is_large_pte(*sptep)? "2MB" : "4kB",
- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
- *sptep, sptep);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
@@ -2657,9 +2736,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
}
}
- kvm_release_pfn_clean(pfn);
-
- return emulate;
+ return ret;
}
static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2693,9 +2770,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
if (ret <= 0)
return -1;
- for (i = 0; i < ret; i++, gfn++, start++)
+ for (i = 0; i < ret; i++, gfn++, start++) {
mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
+ put_page(pages[i]);
+ }
return 0;
}
@@ -2743,40 +2822,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
- int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
{
- struct kvm_shadow_walk_iterator iterator;
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching the next 9 bits of the address back into
+ * pfn for them.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+ int map_writable, int level, kvm_pfn_t pfn,
+ bool prefault, bool lpage_disallowed)
+{
+ struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int emulate = 0;
- gfn_t pseudo_gfn;
+ int ret;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ gfn_t base_gfn = gfn;
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return 0;
+ return RET_PF_RETRY;
- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
- if (iterator.level == level) {
- emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
- write, level, gfn, pfn, prefault,
- map_writable);
- direct_pte_prefetch(vcpu, iterator.sptep);
- ++vcpu->stat.pf_fixed;
- break;
- }
+ trace_kvm_mmu_spte_requested(gpa, level, pfn);
+ for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
- drop_large_spte(vcpu, iterator.sptep);
- if (!is_shadow_present_pte(*iterator.sptep)) {
- u64 base_addr = iterator.addr;
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == level)
+ break;
- base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
- pseudo_gfn = base_addr >> PAGE_SHIFT;
- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1, 1, ACC_ALL);
+ drop_large_spte(vcpu, it.sptep);
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+ it.level - 1, true, ACC_ALL);
- link_shadow_page(vcpu, iterator.sptep, sp);
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}
- return emulate;
+
+ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+ write, level, base_gfn, pfn, prefault,
+ map_writable);
+ direct_pte_prefetch(vcpu, it.sptep);
+ ++vcpu->stat.pf_fixed;
+ return ret;
}
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
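For illustration only (not part of this patch): the arithmetic disallowed_hugepage_adjust() performs above when it demotes a requested 2MB mapping to 4KB, with hypothetical gfn/pfn values and a local stand-in for KVM_PAGES_PER_HPAGE().

#include <stdio.h>
#include <stdint.h>

/* Stand-in for KVM_PAGES_PER_HPAGE(): 1 page at level 1, 512 at level 2. */
#define PAGES_PER_HPAGE(level)	(1ULL << (((level) - 1) * 9))

int main(void)
{
	uint64_t gfn = 0xabc01a3;	/* hypothetical faulting gfn */
	uint64_t pfn = 0xdef0000;	/* hypothetical pfn, 2MB-aligned (low 9 bits clear) */
	int level = 2;			/* a 2MB mapping was requested */

	/* Same steps as disallowed_hugepage_adjust() above. */
	uint64_t page_mask = PAGES_PER_HPAGE(level) - PAGES_PER_HPAGE(level - 1);	/* 511 */
	pfn |= gfn & page_mask;		/* 0xdef0000 | 0x1a3 = 0xdef01a3 */
	level--;			/* caller retries the walk at the 4KB level */

	printf("pfn=%#llx level=%d\n", (unsigned long long)pfn, level);
	return 0;
}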
@@ -2798,25 +2908,23 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
- * Return 1 to tell kvm to emulate it.
*/
if (pfn == KVM_PFN_ERR_RO_FAULT)
- return 1;
+ return RET_PF_EMULATE;
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
- return 0;
+ return RET_PF_RETRY;
}
return -EFAULT;
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t *gfnp, kvm_pfn_t *pfnp,
+ gfn_t gfn, kvm_pfn_t *pfnp,
int *levelp)
{
kvm_pfn_t pfn = *pfnp;
- gfn_t gfn = *gfnp;
int level = *levelp;
/*
@@ -2826,7 +2934,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
* here.
*/
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
- level == PT_PAGE_TABLE_LEVEL &&
+ !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
PageTransCompoundMap(pfn_to_page(pfn)) &&
!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
@@ -2843,8 +2951,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
mask = KVM_PAGES_PER_HPAGE(level) - 1;
VM_BUG_ON((gfn & mask) != (pfn & mask));
if (pfn & mask) {
- gfn &= ~mask;
- *gfnp = gfn;
kvm_release_pfn_clean(pfn);
pfn &= ~mask;
kvm_get_pfn(pfn);
@@ -3012,11 +3118,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
- bool force_pt_level = false;
+ bool force_pt_level;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ force_pt_level = lpage_disallowed;
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
/*
@@ -3031,32 +3140,30 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
}
if (fast_page_fault(vcpu, v, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
return r;
+ r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+ prefault, false);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return r;
}
@@ -3383,38 +3490,38 @@ exit:
return reserved;
}
-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
if (mmio_info_in_cache(vcpu, addr, direct))
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
- return RET_MMIO_PF_BUG;
+ return -EINVAL;
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
- return RET_MMIO_PF_INVALID;
+ return RET_PF_INVALID;
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
}
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return RET_MMIO_PF_RETRY;
+ return RET_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
@@ -3464,7 +3571,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3548,18 +3655,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
- PT_DIRECTORY_LEVEL);
+ force_pt_level =
+ lpage_disallowed ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
if (level > PT_DIRECTORY_LEVEL &&
@@ -3569,32 +3679,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
}
if (fast_page_fault(vcpu, gpa, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
+ r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, lpage_disallowed);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return r;
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4510,23 +4618,24 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
enum emulation_result er;
bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+ r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
- if (r == RET_MMIO_PF_EMULATE) {
+ if (r == RET_PF_EMULATE) {
emulation_type = 0;
goto emulate;
}
- if (r == RET_MMIO_PF_RETRY)
- return 1;
- if (r < 0)
- return r;
}
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
if (r < 0)
return r;
- if (!r)
- return 1;
if (mmio_info_in_cache(vcpu, cr2, direct))
emulation_type = 0;
@@ -4781,9 +4890,9 @@ restart:
* the guest, and the guest page table is using 4K page size
* mapping if the indirect sp has level = 1.
*/
- if (sp->role.direct &&
- !kvm_is_reserved_pfn(pfn) &&
- PageTransCompoundMap(pfn_to_page(pfn))) {
+ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+ !kvm_is_zone_device_pfn(pfn) &&
+ PageTransCompoundMap(pfn_to_page(pfn))) {
drop_spte(kvm, sptep);
need_tlb_flush = 1;
goto restart;
@@ -4965,7 +5074,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
int nr_to_scan = sc->nr_to_scan;
unsigned long freed = 0;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
@@ -5015,7 +5124,7 @@ unlock:
break;
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
return freed;
}
@@ -5039,8 +5148,58 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = get_nx_auto_mode();
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+ int idx;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
int kvm_mmu_module_init(void)
{
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
@@ -5104,3 +5263,116 @@ void kvm_mmu_module_exit(void)
unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+ unsigned int old_val;
+ int err;
+
+ old_val = nx_huge_pages_recovery_ratio;
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ if (READ_ONCE(nx_huge_pages) &&
+ !old_val && nx_huge_pages_recovery_ratio) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of lpage_disallowed pages is expected to
+ * be relatively small compared to the total.
+ */
+ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ struct kvm_mmu_page,
+ lpage_disallowed_link);
+ WARN_ON_ONCE(!sp->lpage_disallowed);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+
+ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (to_zap)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+ ? start_time + 60 * HZ - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_lpages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_lpage_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_lpage_recovery_thread)
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
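The set_nx_huge_pages() and set_nx_huge_pages_recovery_ratio() handlers above back writable module parameters; the registration itself sits outside this hunk. A rough sketch of the expected wiring (the exact ops and permissions used by the series may differ):

static const struct kernel_param_ops nx_huge_pages_ops = {
	.set = set_nx_huge_pages,
	.get = param_get_bool,
};

static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
	.set = set_nx_huge_pages_recovery_ratio,
	.get = param_get_uint,
};

/* Surfaces as /sys/module/kvm/parameters/nx_huge_pages, accepting off/force/auto. */
module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
		&nx_huge_pages_recovery_ratio, 0644);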
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c92834c55c59..e584689e7d46 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -56,23 +56,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- * directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- * fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
- RET_MMIO_PF_EMULATE = 1,
- RET_MMIO_PF_INVALID = 2,
- RET_MMIO_PF_RETRY = 0,
- RET_MMIO_PF_BUG = -1
-};
-
-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
@@ -202,4 +185,8 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
#endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 5a24b846a1cb..756b14ecc957 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -322,6 +322,65 @@ TRACE_EVENT(
__entry->kvm_gen == __entry->spte_gen
)
);
+
+TRACE_EVENT(
+ kvm_mmu_set_spte,
+ TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(level, gfn, sptep),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(u64, sptep)
+ __field(u8, level)
+ /* These depend on page entry type, so compute them now. */
+ __field(bool, r)
+ __field(bool, x)
+ __field(u8, u)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = *sptep;
+ __entry->sptep = virt_to_phys(sptep);
+ __entry->level = level;
+ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+ __entry->x = is_executable_pte(__entry->spte);
+ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ ),
+
+ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ __entry->gfn, __entry->spte,
+ __entry->r ? "r" : "-",
+ __entry->spte & PT_PRESENT_MASK ? "w" : "-",
+ __entry->x ? "x" : "-",
+ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->level, __entry->sptep
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_spte_requested,
+ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
+ TP_ARGS(addr, level, pfn),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, pfn)
+ __field(u8, level)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = addr >> PAGE_SHIFT;
+ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+ __entry->level = level;
+ ),
+
+ TP_printk("gfn %llx pfn %llx level %d",
+ __entry->gfn, __entry->pfn, __entry->level
+ )
+);
+
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 0149ac59c273..3e3016411020 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -17,6 +17,7 @@
*/
#include <linux/kvm_host.h>
+#include <linux/nospec.h>
#include <asm/mtrr.h>
#include "cpuid.h"
@@ -202,11 +203,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
break;
case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
*seg = 1;
- *unit = msr - MSR_MTRRfix16K_80000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix16K_80000,
+ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
break;
case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
*seg = 2;
- *unit = msr - MSR_MTRRfix4K_C0000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix4K_C0000,
+ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
break;
default:
return false;
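The mtrr.c change above, and the pmu.h, pmu_intel.c and x86.c changes below, all apply the same pattern: after a bounds check that the CPU may speculate past, the index is clamped with array_index_nospec() before it is used. A generic sketch of that pattern (not code from this patch):

#include <linux/nospec.h>
#include <linux/types.h>

static u64 read_slot(const u64 *table, size_t nr_entries, size_t idx)
{
	if (idx >= nr_entries)
		return 0;

	/*
	 * The check above can be bypassed speculatively; array_index_nospec()
	 * yields 0 for an out-of-range idx on such paths, so the load below
	 * cannot leave a cache footprint for an attacker-controlled index.
	 */
	idx = array_index_nospec(idx, nr_entries);
	return table[idx];
}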
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 37363900297d..e03225e707b2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -499,6 +499,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
true, true);
+ kvm_release_pfn_clean(pfn);
return true;
}
@@ -572,12 +573,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
- kvm_pfn_t pfn, bool map_writable, bool prefault)
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, emulate;
+ int top_level, ret;
+ gfn_t gfn, base_gfn;
direct_access = gw->pte_access;
@@ -622,36 +625,49 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- for (;
- shadow_walk_okay(&it) && it.level > hlevel;
- shadow_walk_next(&it)) {
- gfn_t direct_gfn;
+ /*
+ * FNAME(page_fault) might have clobbered the bottom bits of
+ * gw->gfn; restore them from the virtual address.
+ */
+ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+ base_gfn = gfn;
+ trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+
+ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
- validate_direct_spte(vcpu, it.sptep, direct_access);
- drop_large_spte(vcpu, it.sptep);
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
- if (is_shadow_present_pte(*it.sptep))
- continue;
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == hlevel)
+ break;
- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ validate_direct_spte(vcpu, it.sptep, direct_access);
- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
- true, direct_access);
- link_shadow_page(vcpu, it.sptep, sp);
+ drop_large_spte(vcpu, it.sptep);
+
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+ it.level - 1, true, direct_access);
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
+ }
}
- clear_sp_write_flooding_count(it.sptep);
- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, gw->gfn, pfn, prefault, map_writable);
+ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, base_gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
-
- return emulate;
+ ++vcpu->stat.pf_fixed;
+ return ret;
out_gpte_changed:
- kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
/*
@@ -717,9 +733,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
kvm_pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ bool force_pt_level = lpage_disallowed;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -746,12 +764,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!prefault)
inject_page_fault(vcpu, &walker.fault);
- return 0;
+ return RET_PF_RETRY;
}
if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
shadow_page_table_clear_flood(vcpu, addr);
- return 1;
+ return RET_PF_EMULATE;
}
vcpu->arch.write_fault_to_shadow_pgtable = false;
@@ -773,7 +791,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
walker.gfn, pfn, walker.pte_access, &r))
@@ -799,6 +817,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
walker.pte_access &= ~ACC_EXEC_MASK;
}
+ r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
@@ -806,19 +825,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
make_mmu_pages_available(vcpu);
if (!force_pt_level)
- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
+ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
- level, pfn, map_writable, prefault);
- ++vcpu->stat.pf_fixed;
+ level, pfn, map_writable, prefault, lpage_disallowed);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return r;
}
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
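A worked example (hypothetical numbers, 4-level 64-bit paging assumed) of the gfn restoration at the top of the rewritten FNAME(fetch) loop: with a 2MB guest mapping, gw->gfn comes back with its low 9 bits cleared, and they are recovered from the faulting virtual address.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t addr = 0x7f1234589000ULL;	/* hypothetical faulting guest virtual address */
	uint64_t gw_gfn = 0xabc200ULL;		/* gfn from the guest PDE, low 9 bits clear */
	uint64_t offset_mask = 0x1ffULL << 12;	/* PT_LVL_OFFSET_MASK(2): bits 12..20 */

	/* Same expression as in FNAME(fetch) above, with PAGE_SHIFT == 12. */
	uint64_t gfn = gw_gfn | ((addr & offset_mask) >> 12);

	printf("gfn=%#llx\n", (unsigned long long)gfn);	/* 0xabc389 */
	return 0;
}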
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 06ce377dcbc9..0827ee7d0e9b 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -124,8 +124,8 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
intr ? kvm_perf_overflow_intr :
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
- printk_once("kvm_pmu: event creation failed %ld\n",
- PTR_ERR(event));
+ pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
+ PTR_ERR(event), pmc->idx);
return;
}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index f96e1f962587..fbf3d25af765 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -1,6 +1,8 @@
#ifndef __KVM_X86_PMU_H
#define __KVM_X86_PMU_H
+#include <linux/nospec.h>
+
#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
@@ -80,8 +82,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
u32 base)
{
- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
- return &pmu->gp_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
return NULL;
}
@@ -91,8 +97,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
{
int base = MSR_CORE_PERF_FIXED_CTR0;
- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
- return &pmu->fixed_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
return NULL;
}
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index 5ab4a364348e..84ae4dd261ca 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -87,10 +87,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
static unsigned intel_find_fixed_event(int idx)
{
- if (idx >= ARRAY_SIZE(fixed_pmc_events))
+ u32 event;
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+
+ if (idx >= size)
return PERF_COUNT_HW_MAX;
- return intel_arch_events[fixed_pmc_events[idx]].event_type;
+ event = fixed_pmc_events[array_index_nospec(idx, size)];
+ return intel_arch_events[event].event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -131,15 +135,19 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
struct kvm_pmc *counters;
+ unsigned int num_counters;
idx &= ~(3u << 30);
- if (!fixed && idx >= pmu->nr_arch_gp_counters)
- return NULL;
- if (fixed && idx >= pmu->nr_arch_fixed_counters)
+ if (fixed) {
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ } else {
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ }
+ if (idx >= num_counters)
return NULL;
- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
-
- return &counters[idx];
+ return &counters[array_index_nospec(idx, num_counters)];
}
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -235,11 +243,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, msr))) {
- if (!msr_info->host_initiated)
- data = (s64)(s32)data;
- pmc->counter += data - pmc_read_counter(pmc);
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if (msr_info->host_initiated)
+ pmc->counter = data;
+ else
+ pmc->counter = (s32)data;
+ return 0;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ pmc->counter = data;
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 01eb0451b96d..1079228e4fef 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -590,8 +590,14 @@ static int get_npt_level(void)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -1518,7 +1524,11 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+ /*
+ * Since the host physical APIC ID is 8 bits,
+ * we can support host APIC IDs up to 255.
+ */
+ if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
@@ -3704,7 +3714,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
return 1;
/* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
return 1;
svm->spec_ctrl = data;
@@ -3940,14 +3950,25 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = svm->vcpu.kvm;
struct kvm_lapic *apic = svm->vcpu.arch.apic;
/*
- * Update ICR high and low, then emulate sending IPI,
- * which is handled when writing APIC_ICR.
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
*/
- kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
- kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, apic,
+ icrl & KVM_APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & KVM_APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
break;
}
case AVIC_IPI_FAILURE_INVALID_TARGET:
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc6754ec5..ea618b713b6f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -434,13 +434,13 @@ TRACE_EVENT(kvm_apic_ipi,
);
TRACE_EVENT(kvm_apic_accept_irq,
- TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec),
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
TP_ARGS(apicid, dm, tm, vec),
TP_STRUCT__entry(
__field( __u32, apicid )
__field( __u16, dm )
- __field( __u8, tm )
+ __field( __u16, tm )
__field( __u8, vec )
),
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a34fb7284024..2ad59d8553a5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -714,7 +714,6 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
- u64 arch_capabilities;
u64 spec_ctrl;
u32 vm_entry_controls_shadow;
@@ -1548,7 +1547,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
return -1;
}
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
{
struct {
u64 vpid : 16;
@@ -1562,7 +1561,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
: : "a"(&operand), "c"(ext) : "cc", "memory");
}
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
{
struct {
u64 eptp, gpa;
@@ -1620,43 +1619,15 @@ static void vmcs_load(struct vmcs *vmcs)
}
#ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
static void crash_vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
- if (!crash_local_vmclear_enabled(cpu))
- return;
-
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
}
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
@@ -1668,19 +1639,24 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- crash_disable_local_vmclear(cpu);
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/*
- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
- * is before setting loaded_vmcs->vcpu to -1 which is done in
- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
- * then adds the vmcs into percpu list before it is deleted.
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->vcpu to
+ * -1, otherwise a different cpu can see vcpu == -1 first and add
+ * loaded_vmcs to its percpu list before it's deleted from this cpu's
+ * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
- loaded_vmcs_init(loaded_vmcs);
- crash_enable_local_vmclear(cpu);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
}
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -2220,17 +2196,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
- if (!enable_ept) {
- /*
- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
- * host CPUID is more efficient than testing guest CPUID
- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
- */
- if (boot_cpu_has(X86_FEATURE_SMEP))
- guest_efer |= EFER_NX;
- else if (!(guest_efer & EFER_NX))
- ignore_bits |= EFER_NX;
- }
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
@@ -2480,18 +2448,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!already_loaded) {
local_irq_disable();
- crash_disable_local_vmclear(cpu);
/*
- * Read loaded_vmcs->cpu should be before fetching
- * loaded_vmcs->loaded_vmcss_on_cpu_link.
- * See the comments in __loaded_vmcs_clear().
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smp_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
- crash_enable_local_vmclear(cpu);
local_irq_enable();
}
@@ -3209,12 +3176,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has_arch_capabilities(vcpu))
- return 1;
- msr_info->data = to_vmx(vcpu)->arch_capabilities;
- break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3376,11 +3337,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated)
- return 1;
- vmx->arch_capabilities = data;
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -3526,21 +3482,6 @@ static int hardware_enable(void)
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
- /*
- * Now we can enable the vmclear operation in kdump
- * since the loaded_vmcss_on_cpu list on this cpu
- * has been initialized.
- *
- * Though the cpu is not in VMX operation now, there
- * is no problem to enable the vmclear operation
- * for the loaded_vmcss_on_cpu list is empty!
- */
- crash_enable_local_vmclear(cpu);
-
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
@@ -4661,6 +4602,26 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
(ss.selector & SEGMENT_RPL_MASK));
}
+static bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu,
+ unsigned int port, int size);
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification;
+ unsigned short port;
+ int size;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
/*
* Check if guest state is valid. Returns true if valid, false if
* not.
@@ -5468,8 +5429,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- vmx->arch_capabilities = kvm_get_arch_capabilities();
-
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
/* 22.2.1, 20.8.1 */
@@ -6566,20 +6525,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
return 1;
}
else
- return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
- NULL, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, EMULTYPE_SKIP) ==
+ EMULATE_DONE;
}
- ret = handle_mmio_page_fault(vcpu, gpa, true);
- if (likely(ret == RET_MMIO_PF_EMULATE))
- return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
- EMULATE_DONE;
-
- if (unlikely(ret == RET_MMIO_PF_INVALID))
- return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
-
- if (unlikely(ret == RET_MMIO_PF_RETRY))
- return 1;
+ ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+ if (ret >= 0)
+ return ret;
/* It is the real ept misconfig */
WARN_ON(1);
@@ -7653,6 +7605,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
gva_t gva = 0;
+ struct x86_exception e;
if (!nested_vmx_check_permission(vcpu) ||
!nested_vmx_check_vmcs12(vcpu))
@@ -7679,8 +7632,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
vmx_instruction_info, true, &gva))
return 1;
/* _system ok, as nested_vmx_check_permission verified cpl=0 */
- kvm_write_guest_virt_system(vcpu, gva, &field_value,
- (is_long_mode(vcpu) ? 8 : 4), NULL);
+ if (kvm_write_guest_virt_system(vcpu, gva, &field_value,
+ (is_long_mode(vcpu) ? 8 : 4),
+ &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
}
nested_vmx_succeed(vcpu);
@@ -8050,23 +8007,17 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size)
{
- unsigned long exit_qualification;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
gpa_t bitmap, last_bitmap;
- unsigned int port;
- int size;
u8 b;
- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
- return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- port = exit_qualification >> 16;
- size = (exit_qualification & 7) + 1;
-
last_bitmap = (gpa_t)-1;
b = -1;
@@ -9220,8 +9171,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->__launched = vmx->loaded_vmcs->launched;
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
asm(
/* Store host registers */
@@ -9580,8 +9534,8 @@ free_vcpu:
return ERR_PTR(err);
}
-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
static int vmx_vm_init(struct kvm *kvm)
{
@@ -11356,11 +11310,71 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
}
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned short port;
+ bool intercept;
+ int size;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ }
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ intercept = nested_cpu_has(vmcs12,
+ CPU_BASED_UNCOND_IO_EXITING);
+ else
+ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return X86EMUL_CONTINUE;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ switch (info->intercept) {
+ /*
+ * RDPID causes #UD if disabled through secondary execution controls.
+ * Because it is marked as EmulateOnUD, we need to intercept it here.
+ */
+ case x86_intercept_rdtscp:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->exception.error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ return vmx_check_intercept_io(vcpu, info);
+
+ /* TODO: check more intercepts... */
+ default:
+ break;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
}
#ifdef CONFIG_X86_64
@@ -11842,7 +11856,7 @@ module_exit(vmx_exit)
static int __init vmx_init(void)
{
- int r;
+ int r, cpu;
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
@@ -11864,6 +11878,12 @@ static int __init vmx_init(void)
}
}
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
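The new ordering comments in __loaded_vmcs_clear() and vmx_vcpu_load() earlier in this file describe a standard unlink-then-publish pairing. A stripped-down, illustrative model of that contract (not VMX code):

#include <linux/list.h>
#include <asm/barrier.h>

struct item {
	struct list_head link;
	int cpu;			/* -1 once removed from its percpu list */
};

/* Writer, mirroring __loaded_vmcs_clear(): unlink first, then publish cpu = -1. */
static void clear_item(struct item *it)
{
	list_del(&it->link);
	smp_wmb();			/* order list_del() before the cpu update */
	it->cpu = -1;
}

/* Reader, mirroring vmx_vcpu_load(): observe cpu before touching this cpu's list. */
static void load_item(struct item *it, struct list_head *percpu_list, int cpu)
{
	if (it->cpu != cpu) {
		smp_rmb();		/* pairs with the smp_wmb() in clear_item() */
		list_add(&it->link, percpu_list);
	}
	it->cpu = cpu;
}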
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a29df9ccbfde..314eb954bdee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
+#include <linux/nospec.h>
#include <trace/events/kvm.h>
#include <asm/debugreg.h>
@@ -191,6 +192,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits) },
{ NULL }
};
@@ -272,13 +274,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
int err;
- if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ value = (value & mask) | (smsr->values[slot].host & ~mask);
+ if (value == smsr->values[slot].curr)
return 0;
- smsr->values[slot].curr = value;
err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
if (err)
return 1;
+ smsr->values[slot].curr = value;
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
@@ -535,8 +538,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
data, offset, len, access);
}
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
+ rsvd_bits(1, 2);
+}
+
/*
- * Load the pae pdptrs. Return true is they are all valid.
+ * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
*/
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
@@ -555,8 +564,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
- (pdpte[i] &
- vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
+ (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
ret = 0;
goto out;
}
@@ -582,7 +590,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
gfn_t gfn;
int r;
- if (is_long_mode(vcpu) || !is_pae(vcpu))
+ if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
return false;
if (!test_bit(VCPU_EXREG_PDPTR,
@@ -882,9 +890,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- vcpu->arch.db[dr] = val;
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
@@ -921,9 +931,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[dr];
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
break;
case 4:
/* fall through */
@@ -1027,6 +1039,14 @@ u64 kvm_get_arch_capabilities(void)
rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
/*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
* If we're doing cache flushes (either "always" or "cond")
* we will do one whenever the guest does a vmlaunch/vmresume.
* If an outer hypervisor is doing the cache flush for us
@@ -1038,8 +1058,40 @@ u64 kvm_get_arch_capabilities(void)
if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+
+ /*
+ * On TAA affected systems, export MDS_NO=0 when:
+ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+ * - Updated microcode is present. This is detected by
+ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+ * that VERW clears CPU buffers.
+ *
+ * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+ * mitigation and don't complain:
+ *
+ * "Vulnerable: Clear CPU buffers attempted, no microcode"
+ *
+ * If TSX is disabled on the system, guests are also mitigated against
+ * TAA and clear CPU buffer mitigation is not required for guests.
+ */
+ if (!boot_cpu_has(X86_FEATURE_RTM))
+ data &= ~ARCH_CAP_TAA_NO;
+ else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ data |= ARCH_CAP_TAA_NO;
+ else if (data & ARCH_CAP_TSX_CTRL_MSR)
+ data &= ~ARCH_CAP_MDS_NO;
+
+ /* KVM does not emulate MSR_IA32_TSX_CTRL. */
+ data &= ~ARCH_CAP_TSX_CTRL_MSR;
return data;
}
+
EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
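The TAA-related bit handling added to kvm_get_arch_capabilities() above can be hard to follow from the diff. Below is an illustrative userspace model of just those few lines (bit positions as in msr-index.h; not KVM code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CAP_MDS_NO		(1ULL << 5)
#define CAP_TSX_CTRL_MSR	(1ULL << 7)
#define CAP_TAA_NO		(1ULL << 8)

static uint64_t adjust_taa(uint64_t data, bool host_rtm, bool host_taa_bug)
{
	if (!host_rtm)
		data &= ~CAP_TAA_NO;		/* host has TSX off: do not advertise TAA_NO */
	else if (!host_taa_bug)
		data |= CAP_TAA_NO;		/* host is not affected by TAA */
	else if (data & CAP_TSX_CTRL_MSR)
		data &= ~CAP_MDS_NO;		/* make guests deploy VERW buffer clearing */

	return data & ~CAP_TSX_CTRL_MSR;	/* TSX_CTRL itself is never passed through */
}

int main(void)
{
	/* Affected host, TSX on, updated microcode: MDS_NO is withdrawn from the guest. */
	printf("%#llx\n", (unsigned long long)
	       adjust_taa(CAP_MDS_NO | CAP_TSX_CTRL_MSR, true, true));
	return 0;
}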
@@ -1073,11 +1125,8 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
return 0;
}
-bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (efer & efer_reserved_bits)
- return false;
-
if (efer & EFER_FFXSR) {
struct kvm_cpuid_entry2 *feat;
@@ -1095,19 +1144,33 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
}
return true;
+
+}
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & efer_reserved_bits)
+ return false;
+
+ return __kvm_valid_efer(vcpu, efer);
}
EXPORT_SYMBOL_GPL(kvm_valid_efer);
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 old_efer = vcpu->arch.efer;
+ u64 efer = msr_info->data;
- if (!kvm_valid_efer(vcpu, efer))
+ if (efer & efer_reserved_bits)
return 1;
- if (is_paging(vcpu)
- && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
- return 1;
+ if (!msr_info->host_initiated) {
+ if (!__kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu) &&
+ (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+ }
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
@@ -1354,7 +1417,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
vcpu->arch.tsc_always_catchup = 1;
return 0;
} else {
- WARN(1, "user requested TSC rate below hardware speed\n");
+ pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
return -1;
}
}
@@ -1364,8 +1427,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
user_tsc_khz, tsc_khz);
if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
- WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
- user_tsc_khz);
+ pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
return -1;
}
@@ -2067,7 +2130,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
@@ -2197,8 +2263,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated)
vcpu->arch.microcode_version = data;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
case MSR_EFER:
- return set_efer(vcpu, data);
+ return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
@@ -2430,7 +2501,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
data = vcpu->arch.mce_banks[offset];
break;
}
@@ -2473,6 +2547,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has_arch_capabilities(vcpu))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -4598,6 +4678,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -5735,8 +5822,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
emulation_type))
return EMULATE_DONE;
- if (ctxt->have_exception && inject_emulated_exception(vcpu))
+ if (ctxt->have_exception) {
+ /*
+ * #UD should result in just EMULATION_FAILED, and trap-like
+ * exceptions should not be encountered during decode.
+ */
+ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+ inject_emulated_exception(vcpu);
return EMULATE_DONE;
+ }
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
return handle_emulation_failure(vcpu);
@@ -5801,12 +5896,13 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE && ctxt->tf)
- kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
- exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (r == EMULATE_DONE && ctxt->tf)
+ kvm_vcpu_do_singlestep(vcpu, &r);
__kvm_set_rflags(vcpu, ctxt->eflags);
+ }
/*
* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
@@ -5908,17 +6004,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- if (vcpu->cpu != smp_processor_id())
+ if (vcpu->cpu != raw_smp_processor_id())
send_ipi = 1;
}
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
@@ -6036,14 +6132,12 @@ static void kvm_set_mmio_spte_mask(void)
/* Set the present bit. */
mask |= 1ull;
-#ifdef CONFIG_X86_64
/*
* If reserved bit is not supported, clear the present bit to disable
* mmio page fault.
*/
if (maxphyaddr == 52)
mask &= ~1ull;
-#endif
kvm_mmu_set_mmio_spte_mask(mask);
}
@@ -6056,12 +6150,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
struct kvm_vcpu *vcpu;
int i;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
atomic_set(&kvm_guest_has_master_clock, 0);
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
}
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -7448,7 +7542,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+ if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
@@ -7672,6 +7766,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
kvm_vcpu_mtrr_init(vcpu);
r = vcpu_load(vcpu);
if (r)
@@ -7712,7 +7807,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
- kvm_x86_ops->vcpu_free(vcpu);
+ kvm_arch_vcpu_free(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -8028,6 +8123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -8056,6 +8152,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return 0;
}
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
+}
+
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
int r;
@@ -8162,6 +8263,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
}
EXPORT_SYMBOL_GPL(x86_set_memory_region);
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ kvm_mmu_pre_destroy_vm(kvm);
+}
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
if (current->mm == kvm->mm) {
@@ -8213,6 +8319,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
{
int i;
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by __kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
+
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
@@ -8286,6 +8399,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
+ if (change == KVM_MR_MOVE)
+ return kvm_arch_create_memslot(kvm, memslot,
+ mem->memory_size >> PAGE_SHIFT);
+
return 0;
}
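
The KVM_MR_MOVE handling above depends on the new memslot starting life as a byte copy of the old one: kvm_arch_create_memslot() must first clear the copied arch pointers so the old arrays remain owned by, and are later freed through, the original slot. A minimal userspace sketch of that ownership rule follows; toy_slot and toy_create_slot are illustrative names, not kernel symbols.

#include <stdlib.h>
#include <string.h>

struct toy_slot {
	unsigned long *lpage_info;	/* stands in for the slot->arch arrays */
	size_t npages;
};

/* The caller hands us a byte copy of the old slot, so drop the copied
 * pointers before allocating fresh arrays; the old arrays stay reachable
 * only through the original slot and get freed exactly once. */
static int toy_create_slot(struct toy_slot *slot, size_t npages)
{
	memset(slot, 0, sizeof(*slot));
	slot->lpage_info = calloc(npages, sizeof(*slot->lpage_info));
	if (!slot->lpage_info)
		return -1;
	slot->npages = npages;
	return 0;
}

int main(void)
{
	struct toy_slot old = { 0 }, moved;

	toy_create_slot(&old, 16);
	moved = old;			/* the "move" starts from a copy */
	toy_create_slot(&moved, 16);	/* copied pointer cleared, new array */
	free(old.lpage_info);		/* old array freed once, via old slot */
	free(moved.lpage_info);
	return 0;
}
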
diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c
index 2dd1fe13a37b..19f707992db2 100644
--- a/arch/x86/lib/cpu.c
+++ b/arch/x86/lib/cpu.c
@@ -1,5 +1,6 @@
#include <linux/types.h>
#include <linux/export.h>
+#include <asm/cpu.h>
unsigned int x86_family(unsigned int sig)
{
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 9758524ee99f..71a3759a2d4e 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -112,8 +112,8 @@ static void delay_mwaitx(unsigned long __loops)
__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
/*
- * AMD, like Intel, supports the EAX hint and EAX=0xf
- * means, do not enter any deep C-state and we use it
+ * AMD, like Intel's MWAIT version, supports the EAX hint and
+ * EAX=0xf0 means, do not enter any deep C-state and we use it
* here in delay() to minimize wakeup latency.
*/
__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1754e094bc28..82e105b284e0 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -333,7 +333,7 @@ AVXcode: 1
06: CLTS
07: SYSRET (o64)
08: INVD
-09: WBINVD
+09: WBINVD | WBNOINVD (F3)
0a:
0b: UD2 (1B)
0c:
@@ -364,7 +364,7 @@ AVXcode: 1
# a ModR/M byte.
1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev
1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv
-1c:
+1c: Grp20 (1A),(1C)
1d:
1e:
1f: NOP Ev
@@ -792,6 +792,8 @@ f3: Grp17 (1A)
f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
+f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3)
+f9: MOVDIRI My,Gy
EndTable
Table: 3-byte opcode 2 (0x0f 0x3a)
@@ -907,7 +909,7 @@ EndTable
GrpTable: Grp3_2
0: TEST Ev,Iz
-1:
+1: TEST Ev,Iz
2: NOT Ev
3: NEG Ev
4: MUL rAX,Ev
@@ -943,9 +945,9 @@ GrpTable: Grp6
EndTable
GrpTable: Grp7
-0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B)
-2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
3: LIDT Ms
4: SMSW Mw/Rv
5: rdpkru (110),(11B) | wrpkru (111),(11B)
@@ -1011,7 +1013,7 @@ GrpTable: Grp15
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
4: XSAVE
5: XRSTOR | lfence (11B)
-6: XSAVEOPT | clwb (66) | mfence (11B)
+6: XSAVEOPT | clwb (66) | mfence (11B) | TPAUSE Rd (66),(11B) | UMONITOR Rv (F3),(11B) | UMWAIT Rd (F2),(11B)
7: clflush | clflushopt (66) | sfence (11B)
EndTable
@@ -1042,6 +1044,10 @@ GrpTable: Grp19
6: vscatterpf1qps/d Wx (66),(ev)
EndTable
+GrpTable: Grp20
+0: cldemote Mb
+EndTable
+
# AMD's Prefetch Group
GrpTable: GrpP
0: PREFETCH
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
index afbc4d805d66..df5aee5402c4 100644
--- a/arch/x86/math-emu/fpu_emu.h
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -176,7 +176,7 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
#define setexponentpos(x,y) { (*(short *)&((x)->exp)) = \
((y) + EXTENDED_Ebias) & 0x7fff; }
#define exponent16(x) (*(short *)&((x)->exp))
-#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); }
+#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (u16)(y); }
#define addexponent(x,y) { (*(short *)&((x)->exp)) += (y); }
#define stdexp(x) { (*(short *)&((x)->exp)) += EXTENDED_Ebias; }
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
index 00548354912f..382093c5072b 100644
--- a/arch/x86/math-emu/reg_constant.c
+++ b/arch/x86/math-emu/reg_constant.c
@@ -17,7 +17,7 @@
#include "control_w.h"
#define MAKE_REG(s, e, l, h) { l, h, \
- ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
+ (u16)((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
#if 0
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5c419b8f99a0..102b4e78f4e6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -273,18 +273,19 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
pmd = pmd_offset(pud, address);
pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
- return NULL;
- if (!pmd_present(*pmd))
+ if (pmd_present(*pmd) != pmd_present(*pmd_k))
set_pmd(pmd, *pmd_k);
+
+ if (!pmd_present(*pmd_k))
+ return NULL;
else
- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+ BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
return pmd_k;
}
-void vmalloc_sync_all(void)
+static void vmalloc_sync(void)
{
unsigned long address;
@@ -299,22 +300,28 @@ void vmalloc_sync_all(void)
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
spinlock_t *pgt_lock;
- pmd_t *ret;
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
- ret = vmalloc_sync_one(page_address(page), address);
+ vmalloc_sync_one(page_address(page), address);
spin_unlock(pgt_lock);
-
- if (!ret)
- break;
}
spin_unlock(&pgd_lock);
}
}
+void vmalloc_sync_mappings(void)
+{
+ vmalloc_sync();
+}
+
+void vmalloc_sync_unmappings(void)
+{
+ vmalloc_sync();
+}
+
/*
* 32-bit:
*
@@ -409,11 +416,23 @@ out:
#else /* CONFIG_X86_64: */
-void vmalloc_sync_all(void)
+void vmalloc_sync_mappings(void)
{
+ /*
+ * 64-bit mappings might allocate new p4d/pud pages
+ * that need to be propagated to all tasks' PGDs.
+ */
sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
}
+void vmalloc_sync_unmappings(void)
+{
+ /*
+ * Unmappings never allocate or free p4d/pud pages.
+ * No work is required here.
+ */
+}
+
/*
* 64-bit:
*
@@ -430,8 +449,6 @@ static noinline int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
- WARN_ON_ONCE(in_nmi());
-
/*
* Copy kernel mappings over when needed. This can also
* happen within a race in page table update. In the later
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 1680768d392c..82f727fbbbd2 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -98,6 +98,20 @@ static inline int pte_allows_gup(unsigned long pteval, int write)
}
/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
+/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
@@ -112,7 +126,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
- struct page *page;
+ struct page *head, *page;
/* Similar to the PMD case, NUMA hinting must take slow path */
if (pte_protnone(pte)) {
@@ -138,7 +152,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- get_page(page);
+
+ head = try_get_compound_head(page, 1);
+ if (!head) {
+ put_dev_pagemap(pgmap);
+ pte_unmap(ptep);
+ return 0;
+ }
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ put_page(head);
+ put_dev_pagemap(pgmap);
+ pte_unmap(ptep);
+ return 0;
+ }
+
put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
@@ -174,9 +202,12 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
+ if (unlikely(!try_get_page(page))) {
+ put_dev_pagemap(pgmap);
+ return 0;
+ }
SetPageReferenced(page);
pages[*nr] = page;
- get_page(page);
put_dev_pagemap(pgmap);
(*nr)++;
pfn++;
@@ -202,6 +233,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
refs = 0;
head = pmd_page(pmd);
+ if (WARN_ON_ONCE(page_ref_count(head) <= 0))
+ return 0;
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
@@ -261,6 +294,8 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
refs = 0;
head = pud_page(pud);
+ if (WARN_ON_ONCE(page_ref_count(head) <= 0))
+ return 0;
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
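
The gup changes above refuse to take a reference once the compound head's count has dropped to or below zero, and re-check the PTE after the speculative get. A simplified single-file model of the get-if-positive step is sketched below using C11 atomics instead of the kernel's page_ref/page_cache helpers; struct page here is a stand-in, not the kernel structure.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct page { atomic_int refcount; };

/* Take a reference only while the count is still positive, mirroring the
 * WARN_ON_ONCE(page_ref_count(head) < 0) plus speculative-add pattern in
 * the patch; a page already headed for the free lists is refused. */
static struct page *try_get_page_model(struct page *p)
{
	int old = atomic_load(&p->refcount);

	do {
		if (old <= 0)
			return NULL;
	} while (!atomic_compare_exchange_weak(&p->refcount, &old, old + 1));
	return p;
}

int main(void)
{
	struct page live, dying;

	atomic_init(&live.refcount, 1);
	atomic_init(&dying.refcount, 0);
	printf("live:  %s\n", try_get_page_model(&live) ? "got ref" : "refused");
	printf("dying: %s\n", try_get_page_model(&dying) ? "got ref" : "refused");
	return 0;
}
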
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 90801a8f19c9..ce092a62fc5d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -790,7 +790,7 @@ unsigned long max_swapfile_size(void)
pages = generic_max_swapfile_size();
- if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
unsigned long long l1tf_limit = l1tf_pfn_limit();
/*
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 3f729e20f0e3..12522dbae615 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -9,6 +9,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
+#include <linux/cpu.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -297,7 +298,8 @@ void __init kaiser_check_boottime_disable(void)
goto skip;
}
- if (cmdline_find_option_bool(boot_command_line, "nopti"))
+ if (cmdline_find_option_bool(boot_command_line, "nopti") ||
+ cpu_mitigations_off())
goto disable;
skip:
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e30baa8ad94f..08e0380414a9 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -251,7 +251,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
- pgdp[i] = native_make_pgd(0);
+ pgd_clear(&pgdp[i]);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
@@ -419,7 +419,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed && dirty) {
- *ptep = entry;
+ set_pte(ptep, entry);
pte_update(vma->vm_mm, address, ptep);
}
@@ -436,7 +436,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
- *pmdp = entry;
+ set_pmd(pmdp, entry);
/*
* We had a write-protection fault here and changed the pmd
* to be more permissive. No need to flush the TLB for that,
@@ -544,8 +544,8 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
fixmaps_set++;
}
-void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
- pgprot_t flags)
+void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
+ phys_addr_t phys, pgprot_t flags)
{
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index eac92e2d171b..a112bb175dd4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,12 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB 0x1UL
+
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
struct flush_tlb_info {
@@ -101,33 +107,101 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+{
+ unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+ return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
+{
+ if (!next || !next->mm)
+ return;
+
+ /*
+ * Both, the conditional and the always IBPB mode use the mm
+ * pointer to avoid the IBPB when switching between tasks of the
+ * same process. Using the mm pointer instead of mm->context.ctx_id
+ * opens a hypothetical hole vs. mm_struct reuse, which is more or
+ * less impossible to control by an attacker. Aside from that, it
+ * would only affect the first schedule so the theoretically
+ * exposed data is not really interesting.
+ */
+ if (static_branch_likely(&switch_mm_cond_ibpb)) {
+ unsigned long prev_mm, next_mm;
+
+ /*
+ * This is a bit more complex than the always mode because
+ * it has to handle two cases:
+ *
+ * 1) Switch from a user space task (potential attacker)
+ * which has TIF_SPEC_IB set to a user space task
+ * (potential victim) which has TIF_SPEC_IB not set.
+ *
+ * 2) Switch from a user space task (potential attacker)
+ * which has TIF_SPEC_IB not set to a user space task
+ * (potential victim) which has TIF_SPEC_IB set.
+ *
+ * This could be done by unconditionally issuing IBPB when
+ * a task which has TIF_SPEC_IB set is either scheduled in
+ * or out. Though that results in two flushes when:
+ *
+ * - the same user space task is scheduled out and later
+ * scheduled in again and only a kernel thread ran in
+ * between.
+ *
+ * - a user space task belonging to the same process is
+ * scheduled in after a kernel thread ran in between
+ *
+ * - a user space task belonging to the same process is
+ * scheduled in immediately.
+ *
+ * Optimize this with reasonably small overhead for the
+ * above cases. Mangle the TIF_SPEC_IB bit into the mm
+ * pointer of the incoming task which is stored in
+ * cpu_tlbstate.last_user_mm_ibpb for comparison.
+ */
+ next_mm = mm_mangle_tif_spec_ib(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+ /*
+ * Issue IBPB only if the mm's are different and one or
+ * both have the IBPB bit set.
+ */
+ if (next_mm != prev_mm &&
+ (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+ indirect_branch_prediction_barrier();
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+ }
+
+ if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+ /*
+ * Only flush when switching to a user space task with a
+ * different context than the user space task which ran
+ * last on this CPU.
+ */
+ if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ indirect_branch_prediction_barrier();
+ this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+ }
+ }
+}
+
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
- u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
/*
* Avoid user/user BTB poisoning by flushing the branch
* predictor when switching between processes. This stops
* one process from doing Spectre-v2 attacks on another.
- *
- * As an optimization, flush indirect branches only when
- * switching into processes that disable dumping. This
- * protects high value processes like gpg, without having
- * too high performance overhead. IBPB is *expensive*!
- *
- * This will not flush branches when switching into kernel
- * threads. It will also not flush if we switch to idle
- * thread and back to the same process. It will flush if we
- * switch to a different non-dumpable process.
*/
- if (tsk && tsk->mm &&
- tsk->mm->context.ctx_id != last_ctx_id &&
- get_dumpable(tsk->mm) != SUID_DUMP_USER)
- indirect_branch_prediction_barrier();
+ cond_ibpb(tsk);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -143,14 +217,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
}
- /*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
- */
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next);
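
The conditional-IBPB logic above folds TIF_SPEC_IB into bit 0 of the mm pointer (which is always free, since mm_struct is word-aligned) so a single compare decides whether a barrier is needed. A standalone sketch of just that comparison follows; LAST_USER_MM_IBPB matches the patch, the TIF_SPEC_IB bit number is illustrative, task and mm are reduced to plain integers, and issuing the real barrier is out of scope.

#include <stdbool.h>
#include <stdio.h>

#define TIF_SPEC_IB		9	/* illustrative bit number */
#define LAST_USER_MM_IBPB	0x1UL

/* Fold the incoming task's TIF_SPEC_IB bit into bit 0 of its mm pointer,
 * as mm_mangle_tif_spec_ib() does above. */
static unsigned long mangle(unsigned long mm, unsigned long tif_flags)
{
	return mm | ((tif_flags >> TIF_SPEC_IB) & LAST_USER_MM_IBPB);
}

/* IBPB is needed only when the mangled values differ and at least one of
 * them carries the IBPB bit: a real cross-process switch where the
 * previous or the next task asked for the barrier. */
static bool needs_ibpb(unsigned long prev_mm, unsigned long next_mm)
{
	return next_mm != prev_mm && ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
}

int main(void)
{
	unsigned long mm_a = 0x1000, mm_b = 0x2000;
	unsigned long prev = mangle(mm_a, 1UL << TIF_SPEC_IB);	/* flagged task */
	unsigned long next = mangle(mm_b, 0);			/* unflagged task */

	printf("cross-process, one side flagged: %d\n", needs_ibpb(prev, next));
	printf("same mm, same flags:             %d\n",
	       needs_ibpb(mangle(mm_a, 0), mangle(mm_a, 0)));
	return 0;
}
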
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index cd9764520851..d9dabd0c31fc 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -15,8 +15,6 @@
#include <asm/nospec-branch.h>
#include <linux/bpf.h>
-int bpf_jit_enable __read_mostly;
-
/*
* assembly code in arch/x86/net/bpf_jit.S
*/
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 20fa7c84109d..62950ef7f84e 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -573,6 +573,17 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar);
/*
+ * Device [1022:7914]
+ * When in D0, PME# doesn't get asserted when plugging USB 2.0 device.
+ */
+static void pci_fixup_amd_fch_xhci_pme(struct pci_dev *dev)
+{
+ dev_info(&dev->dev, "PME# does not work under D0, disabling it\n");
+ dev->pme_support &= ~(PCI_PM_CAP_PME_D0 >> PCI_PM_CAP_PME_SHIFT);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7914, pci_fixup_amd_fch_xhci_pme);
+
+/*
* Apple MacBook Pro: Avoid [mem 0x7fa00000-0x7fbfffff]
*
* Using the [mem 0x7fa00000-0x7fbfffff] region, e.g., by assigning it to
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9bd115484745..5f0e596b0519 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1117,6 +1117,8 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
void __init pcibios_irq_init(void)
{
+ struct irq_routing_table *rtable = NULL;
+
DBG(KERN_DEBUG "PCI: IRQ init\n");
if (raw_pci_ops == NULL)
@@ -1127,8 +1129,10 @@ void __init pcibios_irq_init(void)
pirq_table = pirq_find_routing_table();
#ifdef CONFIG_PCI_BIOS
- if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
+ if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) {
pirq_table = pcibios_get_irq_routing_table();
+ rtable = pirq_table;
+ }
#endif
if (pirq_table) {
pirq_peer_trick();
@@ -1143,8 +1147,10 @@ void __init pcibios_irq_init(void)
* If we're using the I/O APIC, avoid using the PCI IRQ
* routing table
*/
- if (io_apic_assign_pci_irqs)
+ if (io_apic_assign_pci_irqs) {
+ kfree(rtable);
pirq_table = NULL;
+ }
}
x86_init.pci.fixup_irqs();
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index d49d3be81953..ecb5866aaf84 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -154,8 +154,8 @@ static void punit_dbgfs_unregister(void)
(kernel_ulong_t)&drv_data }
static const struct x86_cpu_id intel_punit_cpu_ids[] = {
- ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt),
- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng),
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt),
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng),
ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht),
{}
};
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index a0e85f2aff7d..f08abdf8bb67 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -478,7 +478,6 @@ void __init efi_init(void)
efi_char16_t *c16;
char vendor[100] = "unknown";
int i = 0;
- void *tmp;
#ifdef CONFIG_X86_32
if (boot_params.efi_info.efi_systab_hi ||
@@ -503,14 +502,16 @@ void __init efi_init(void)
/*
* Show what we know for posterity
*/
- c16 = tmp = early_memremap(efi.systab->fw_vendor, 2);
+ c16 = early_memremap_ro(efi.systab->fw_vendor,
+ sizeof(vendor) * sizeof(efi_char16_t));
if (c16) {
- for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
- vendor[i] = *c16++;
+ for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i)
+ vendor[i] = c16[i];
vendor[i] = '\0';
- } else
+ early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t));
+ } else {
pr_err("Could not map the firmware vendor!\n");
- early_memunmap(tmp, 2);
+ }
pr_info("EFI v%u.%.02u by %s\n",
efi.systab->hdr.revision >> 16,
@@ -896,9 +897,6 @@ static void __init kexec_enter_virtual_mode(void)
if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
runtime_code_page_mkexec();
-
- /* clean DUMMY object */
- efi_delete_dummy_variable();
#endif
}
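
The efi_init() change above maps enough of the firmware vendor string for the whole destination buffer and copies by index, so the loop is bounded by both the buffer size and the terminating NUL. A small host-side sketch of that bounded copy follows, narrowing UCS-2 code units to bytes as the kernel code does; the sample firmware string is made up.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t efi_char16;

/* Copy at most len - 1 UCS-2 code units, stopping early at the NUL
 * terminator, then force-terminate: the same bounds the patched loop
 * enforces with c16[i]. */
static void copy_vendor(char *vendor, size_t len, const efi_char16 *c16)
{
	size_t i;

	for (i = 0; i < len - 1 && c16[i]; i++)
		vendor[i] = (char)c16[i];
	vendor[i] = '\0';
}

int main(void)
{
	const efi_char16 fw[] = { 'A', 'c', 'm', 'e', ' ', 'E', 'F', 'I', 0 };
	char vendor[100] = "unknown";

	copy_vendor(vendor, sizeof(vendor), fw);
	printf("EFI vendor: %s\n", vendor);
	return 0;
}
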
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 53cace2ec0e2..c8f947a4aaf2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -13,6 +13,7 @@
#include <linux/smp.h>
#include <linux/perf_event.h>
#include <linux/tboot.h>
+#include <linux/dmi.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
@@ -24,7 +25,7 @@
#include <asm/debugreg.h>
#include <asm/cpu.h>
#include <asm/mmu_context.h>
-#include <linux/dmi.h>
+#include <asm/cpu_device_id.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -82,12 +83,8 @@ static void __save_processor_state(struct saved_context *ctxt)
/*
* descriptor tables
*/
-#ifdef CONFIG_X86_32
store_idt(&ctxt->idt);
-#else
-/* CONFIG_X86_64 */
- store_idt((struct desc_ptr *)&ctxt->idt_limit);
-#endif
+
/*
* We save it here, but restore it only in the hibernate case.
* For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit
@@ -103,22 +100,18 @@ static void __save_processor_state(struct saved_context *ctxt)
/*
* segment registers
*/
-#ifdef CONFIG_X86_32
- savesegment(es, ctxt->es);
- savesegment(fs, ctxt->fs);
+#ifdef CONFIG_X86_32_LAZY_GS
savesegment(gs, ctxt->gs);
- savesegment(ss, ctxt->ss);
-#else
-/* CONFIG_X86_64 */
- asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
- asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
- asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
- asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
- asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
+#endif
+#ifdef CONFIG_X86_64
+ savesegment(gs, ctxt->gs);
+ savesegment(fs, ctxt->fs);
+ savesegment(ds, ctxt->ds);
+ savesegment(es, ctxt->es);
rdmsrl(MSR_FS_BASE, ctxt->fs_base);
- rdmsrl(MSR_GS_BASE, ctxt->gs_base);
- rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+ rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
+ rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
mtrr_save_fixed_ranges(NULL);
rdmsrl(MSR_EFER, ctxt->efer);
@@ -178,6 +171,9 @@ static void fix_processor_context(void)
write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
syscall_init(); /* This sets MSR_*STAR and related */
+#else
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ enable_sep_cpu();
#endif
load_TR_desc(); /* This does ltr */
load_mm_ldt(current->active_mm); /* This does lldt */
@@ -186,9 +182,12 @@ static void fix_processor_context(void)
}
/**
- * __restore_processor_state - restore the contents of CPU registers saved
- * by __save_processor_state()
- * @ctxt - structure to load the registers contents from
+ * __restore_processor_state - restore the contents of CPU registers saved
+ * by __save_processor_state()
+ * @ctxt - structure to load the registers contents from
+ *
+ * The asm code that gets us here will have restored a usable GDT, although
+ * it will be pointing to the wrong alias.
*/
static void notrace __restore_processor_state(struct saved_context *ctxt)
{
@@ -211,46 +210,52 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
write_cr2(ctxt->cr2);
write_cr0(ctxt->cr0);
+ /* Restore the IDT. */
+ load_idt(&ctxt->idt);
+
/*
- * now restore the descriptor tables to their proper values
- * ltr is done i fix_processor_context().
+ * Just in case the asm code got us here with the SS, DS, or ES
+ * out of sync with the GDT, update them.
*/
-#ifdef CONFIG_X86_32
- load_idt(&ctxt->idt);
+ loadsegment(ss, __KERNEL_DS);
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ /*
+ * Restore percpu access. Percpu access can happen in exception
+ * handlers or in complicated helpers like load_gs_index().
+ */
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
#else
-/* CONFIG_X86_64 */
- load_idt((const struct desc_ptr *)&ctxt->idt_limit);
+ loadsegment(fs, __KERNEL_PERCPU);
+ loadsegment(gs, __KERNEL_STACK_CANARY);
#endif
+ /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */
+ fix_processor_context();
+
/*
- * segment registers
+ * Now that we have descriptor tables fully restored and working
+ * exception handling, restore the usermode segments.
*/
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_64
+ loadsegment(ds, ctxt->es);
loadsegment(es, ctxt->es);
loadsegment(fs, ctxt->fs);
- loadsegment(gs, ctxt->gs);
- loadsegment(ss, ctxt->ss);
+ load_gs_index(ctxt->gs);
/*
- * sysenter MSRs
+ * Restore FSBASE and GSBASE after restoring the selectors, since
+ * restoring the selectors clobbers the bases. Keep in mind
+ * that MSR_KERNEL_GS_BASE is horribly misnamed.
*/
- if (boot_cpu_has(X86_FEATURE_SEP))
- enable_sep_cpu();
-#else
-/* CONFIG_X86_64 */
- asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
- asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
- asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
- load_gs_index(ctxt->gs);
- asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
-
wrmsrl(MSR_FS_BASE, ctxt->fs_base);
- wrmsrl(MSR_GS_BASE, ctxt->gs_base);
- wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
+#elif defined(CONFIG_X86_32_LAZY_GS)
+ loadsegment(gs, ctxt->gs);
#endif
- fix_processor_context();
-
do_fpu_end();
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
@@ -288,7 +293,17 @@ int hibernate_resume_nonboot_cpu_disable(void)
* address in its instruction pointer may not be possible to resolve
* any more at that point (the page tables used by it previously may
* have been overwritten by hibernate image data).
+ *
+ * First, make sure that we wake up all the potentially disabled SMT
+ * threads which have been initially brought up and then put into
+ * mwait/cpuidle sleep.
+ * Those will be put to proper (not interfering with hibernation
+ * resume) sleep afterwards, and the resumed kernel will decide itself
+ * what to do with them.
*/
+ ret = cpuhp_smt_enable();
+ if (ret)
+ return ret;
smp_ops.play_dead = resume_play_dead;
ret = disable_nonboot_cpus();
smp_ops.play_dead = play_dead;
@@ -377,15 +392,14 @@ static int __init bsp_pm_check_init(void)
core_initcall(bsp_pm_check_init);
-static int msr_init_context(const u32 *msr_id, const int total_num)
+static int msr_build_context(const u32 *msr_id, const int num)
{
- int i = 0;
+ struct saved_msrs *saved_msrs = &saved_context.saved_msrs;
struct saved_msr *msr_array;
+ int total_num;
+ int i, j;
- if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) {
- pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n");
- return -EINVAL;
- }
+ total_num = saved_msrs->num + num;
msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL);
if (!msr_array) {
@@ -393,19 +407,30 @@ static int msr_init_context(const u32 *msr_id, const int total_num)
return -ENOMEM;
}
- for (i = 0; i < total_num; i++) {
- msr_array[i].info.msr_no = msr_id[i];
+ if (saved_msrs->array) {
+ /*
+ * Multiple callbacks can invoke this function, so copy any
+ * MSR save requests from previous invocations.
+ */
+ memcpy(msr_array, saved_msrs->array,
+ sizeof(struct saved_msr) * saved_msrs->num);
+
+ kfree(saved_msrs->array);
+ }
+
+ for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) {
+ msr_array[i].info.msr_no = msr_id[j];
msr_array[i].valid = false;
msr_array[i].info.reg.q = 0;
}
- saved_context.saved_msrs.num = total_num;
- saved_context.saved_msrs.array = msr_array;
+ saved_msrs->num = total_num;
+ saved_msrs->array = msr_array;
return 0;
}
/*
- * The following section is a quirk framework for problematic BIOSen:
+ * The following sections are a quirk framework for problematic BIOSen:
* Sometimes MSRs are modified by the BIOSen after suspended to
* RAM, this might cause unexpected behavior after wakeup.
* Thus we save/restore these specified MSRs across suspend/resume
@@ -420,7 +445,7 @@ static int msr_initialize_bdw(const struct dmi_system_id *d)
u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL };
pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident);
- return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
+ return msr_build_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
}
static struct dmi_system_id msr_save_dmi_table[] = {
@@ -435,9 +460,58 @@ static struct dmi_system_id msr_save_dmi_table[] = {
{}
};
+static int msr_save_cpuid_features(const struct x86_cpu_id *c)
+{
+ u32 cpuid_msr_id[] = {
+ MSR_AMD64_CPUID_FN_1,
+ };
+
+ pr_info("x86/pm: family %#hx cpu detected, MSR saving is needed during suspending.\n",
+ c->family);
+
+ return msr_build_context(cpuid_msr_id, ARRAY_SIZE(cpuid_msr_id));
+}
+
+static const struct x86_cpu_id msr_save_cpu_table[] = {
+ {
+ .vendor = X86_VENDOR_AMD,
+ .family = 0x15,
+ .model = X86_MODEL_ANY,
+ .feature = X86_FEATURE_ANY,
+ .driver_data = (kernel_ulong_t)msr_save_cpuid_features,
+ },
+ {
+ .vendor = X86_VENDOR_AMD,
+ .family = 0x16,
+ .model = X86_MODEL_ANY,
+ .feature = X86_FEATURE_ANY,
+ .driver_data = (kernel_ulong_t)msr_save_cpuid_features,
+ },
+ {}
+};
+
+typedef int (*pm_cpu_match_t)(const struct x86_cpu_id *);
+static int pm_cpu_check(const struct x86_cpu_id *c)
+{
+ const struct x86_cpu_id *m;
+ int ret = 0;
+
+ m = x86_match_cpu(msr_save_cpu_table);
+ if (m) {
+ pm_cpu_match_t fn;
+
+ fn = (pm_cpu_match_t)m->driver_data;
+ ret = fn(m);
+ }
+
+ return ret;
+}
+
static int pm_check_save_msr(void)
{
dmi_check_system(msr_save_dmi_table);
+ pm_cpu_check(msr_save_cpu_table);
+
return 0;
}
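
msr_build_context() above grows the saved-MSR list across multiple callers: copy whatever earlier quirks already registered, free the old array, then append the new IDs. A standalone sketch of that grow-and-append step follows; the saved list is reduced to bare MSR numbers, the names are illustrative, and the second MSR value is only an example.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct saved_msrs_model {
	uint32_t *ids;
	size_t num;
};

/* Append 'num' MSR ids while preserving ids registered by earlier
 * callers, as msr_build_context() does above. */
static int msrs_append(struct saved_msrs_model *s, const uint32_t *ids, size_t num)
{
	size_t total = s->num + num;
	uint32_t *arr = malloc(total * sizeof(*arr));

	if (!arr)
		return -1;
	if (s->ids) {
		memcpy(arr, s->ids, s->num * sizeof(*arr));
		free(s->ids);
	}
	memcpy(arr + s->num, ids, num * sizeof(*arr));
	s->ids = arr;
	s->num = total;
	return 0;
}

int main(void)
{
	struct saved_msrs_model s = { 0 };
	const uint32_t dmi_quirk[] = { 0x19a };		/* MSR_IA32_THERM_CONTROL */
	const uint32_t cpu_quirk[] = { 0xc0011004 };	/* illustrative AMD CPUID-shadow MSR */

	msrs_append(&s, dmi_quirk, 1);
	msrs_append(&s, cpu_quirk, 1);
	printf("saved %zu MSR(s)\n", s.num);
	free(s.ids);
	return 0;
}
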
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index fef485b789ca..6120046bb7dd 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,7 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/suspend.h>
+#include <linux/cpu.h>
#include <asm/init.h>
#include <asm/proto.h>
@@ -218,3 +219,35 @@ int arch_hibernation_header_restore(void *addr)
restore_cr3 = rdr->cr3;
return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL;
}
+
+int arch_resume_nosmt(void)
+{
+ int ret = 0;
+ /*
+ * We reached this while coming out of hibernation. This means
+ * that SMT siblings are sleeping in hlt, as mwait is not safe
+ * against control transition during resume (see comment in
+ * hibernate_resume_nonboot_cpu_disable()).
+ *
+ * If the resumed kernel has SMT disabled, we have to take all the
+ * SMT siblings out of hlt, and offline them again so that they
+ * end up in mwait proper.
+ *
+ * Called with hotplug disabled.
+ */
+ cpu_hotplug_enable();
+ if (cpu_smt_control == CPU_SMT_DISABLED ||
+ cpu_smt_control == CPU_SMT_FORCE_DISABLED) {
+ enum cpuhp_smt_control old = cpu_smt_control;
+
+ ret = cpuhp_smt_enable();
+ if (ret)
+ goto out;
+ ret = cpuhp_smt_disable(old);
+ if (ret)
+ goto out;
+ }
+out:
+ cpu_hotplug_disable();
+ return ret;
+}
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 25012abc3409..ce5f431e6823 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -47,7 +47,7 @@ $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
targets += realmode.lds
$(obj)/realmode.lds: $(obj)/pasyms.h
-LDFLAGS_realmode.elf := --emit-relocs -T
+LDFLAGS_realmode.elf := -m elf_i386 --emit-relocs -T
CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj)
targets += realmode.elf
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index a3d2c62fd805..0a3ad5dd1e8b 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -68,7 +68,7 @@ BEGIN {
lprefix1_expr = "\\((66|!F3)\\)"
lprefix2_expr = "\\(F3\\)"
- lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)"
+ lprefix3_expr = "\\((F2|!F3|66&F2)\\)"
lprefix_expr = "\\((66|F2|F3)\\)"
max_lprefix = 4
@@ -256,7 +256,7 @@ function convert_operands(count,opnd, i,j,imm,mod)
return add_flags(imm, mod)
}
-/^[0-9a-f]+\:/ {
+/^[0-9a-f]+:/ {
if (NR == 1)
next
# get index
diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c
index 00f54a91bb4b..3c423dfcd78b 100644
--- a/arch/x86/um/os-Linux/registers.c
+++ b/arch/x86/um/os-Linux/registers.c
@@ -5,6 +5,7 @@
*/
#include <errno.h>
+#include <stdlib.h>
#include <sys/ptrace.h>
#ifdef __i386__
#include <sys/user.h>
@@ -26,17 +27,18 @@ int save_i387_registers(int pid, unsigned long *fp_regs)
int save_fp_registers(int pid, unsigned long *fp_regs)
{
+#ifdef PTRACE_GETREGSET
struct iovec iov;
if (have_xstate_support) {
iov.iov_base = fp_regs;
- iov.iov_len = sizeof(struct _xstate);
+ iov.iov_len = FP_SIZE * sizeof(unsigned long);
if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0)
return -errno;
return 0;
- } else {
+ } else
+#endif
return save_i387_registers(pid, fp_regs);
- }
}
int restore_i387_registers(int pid, unsigned long *fp_regs)
@@ -48,17 +50,17 @@ int restore_i387_registers(int pid, unsigned long *fp_regs)
int restore_fp_registers(int pid, unsigned long *fp_regs)
{
+#ifdef PTRACE_SETREGSET
struct iovec iov;
-
if (have_xstate_support) {
iov.iov_base = fp_regs;
- iov.iov_len = sizeof(struct _xstate);
+ iov.iov_len = FP_SIZE * sizeof(unsigned long);
if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0)
return -errno;
return 0;
- } else {
+ } else
+#endif
return restore_i387_registers(pid, fp_regs);
- }
}
#ifdef __i386__
@@ -122,13 +124,21 @@ int put_fp_registers(int pid, unsigned long *regs)
void arch_init_registers(int pid)
{
- struct _xstate fp_regs;
+#ifdef PTRACE_GETREGSET
+ void *fp_regs;
struct iovec iov;
- iov.iov_base = &fp_regs;
- iov.iov_len = sizeof(struct _xstate);
+ fp_regs = malloc(FP_SIZE * sizeof(unsigned long));
+ if (fp_regs == NULL)
+ return;
+
+ iov.iov_base = fp_regs;
+ iov.iov_len = FP_SIZE * sizeof(unsigned long);
if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) == 0)
have_xstate_support = 1;
+
+ free(fp_regs);
+#endif
}
#endif
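
arch_init_registers() above probes whether the host supports the xstate regset by attempting a PTRACE_GETREGSET into a scratch buffer. A hedged userspace sketch of the same probe against an already ptrace-stopped tracee follows; BUF_LONGS is a made-up size standing in for FP_SIZE, and error handling is minimal.

#include <elf.h>		/* NT_X86_XSTATE */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>

#define BUF_LONGS 2696		/* stand-in for FP_SIZE, not a kernel constant */

/* Returns 1 if PTRACE_GETREGSET with NT_X86_XSTATE succeeds for 'pid'
 * (which must already be ptrace-stopped), 0 otherwise: the same probe the
 * patch uses to set have_xstate_support. */
static int probe_xstate(pid_t pid)
{
	struct iovec iov;
	void *buf = malloc(BUF_LONGS * sizeof(unsigned long));
	int ok;

	if (!buf)
		return 0;
	iov.iov_base = buf;
	iov.iov_len = BUF_LONGS * sizeof(unsigned long);
	ok = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) == 0;
	free(buf);
	return ok;
}

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;

	printf("xstate regset %s\n", probe_xstate(pid) ? "available" : "unavailable");
	return 0;
}
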
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index cb3c22370cf5..7bcd10614f8b 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -50,7 +50,11 @@ void foo(void)
DEFINE(HOST_GS, GS);
DEFINE(HOST_ORIG_AX, ORIG_EAX);
#else
- DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long));
+#ifdef FP_XSTATE_MAGIC1
+ DEFINE_LONGS(HOST_FP_SIZE, 2696);
+#else
+ DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long));
+#endif
DEFINE_LONGS(HOST_BX, RBX);
DEFINE_LONGS(HOST_CX, RCX);
DEFINE_LONGS(HOST_DI, RDI);