summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/events/core.c17
-rw-r--r--arch/x86/events/intel/core.c9
-rw-r--r--arch/x86/events/intel/pt.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c4
-rw-r--r--arch/x86/hyperv/hv_init.c3
-rw-r--r--arch/x86/include/asm/compat.h6
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/fpu/internal.h13
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/nospec-branch.h16
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/realmode.h1
-rw-r--r--arch/x86/kernel/acpi/boot.c24
-rw-r--r--arch/x86/kernel/cpu/bugs.c216
-rw-r--r--arch/x86/kernel/cpu/mce/core.c31
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c6
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/early-quirks.c10
-rw-r--r--arch/x86/kernel/kvm.c15
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c6
-rw-r--r--arch/x86/kernel/reboot.c12
-rw-r--r--arch/x86/kernel/tsc.c28
-rw-r--r--arch/x86/kernel/tsc_sync.c41
-rw-r--r--arch/x86/kvm/cpuid.c5
-rw-r--r--arch/x86/kvm/emulate.c14
-rw-r--r--arch/x86/kvm/hyperv.c23
-rw-r--r--arch/x86/kvm/lapic.c8
-rw-r--r--arch/x86/kvm/paging_tmpl.h77
-rw-r--r--arch/x86/kvm/pmu.c2
-rw-r--r--arch/x86/kvm/pmu_amd.c10
-rw-r--r--arch/x86/kvm/svm.c5
-rw-r--r--arch/x86/kvm/vmx/evmcs.h4
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--arch/x86/lib/usercopy_64.c2
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/platform/efi/quirks.c3
-rw-r--r--arch/x86/power/cpu.c29
-rw-r--r--arch/x86/realmode/init.c38
-rw-r--r--arch/x86/um/syscalls_64.c3
-rw-r--r--arch/x86/xen/pmu.c10
-rw-r--r--arch/x86/xen/pmu.h3
-rw-r--r--arch/x86/xen/smp_hvm.c6
-rw-r--r--arch/x86/xen/smp_pv.c2
-rw-r--r--arch/x86/xen/time.c24
47 files changed, 547 insertions, 205 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c2a3ec3dd850..c6c71592f6e4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1990,6 +1990,7 @@ config EFI
depends on ACPI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
---help---
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 00bccb4d1772..e4f7ac28dcf2 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2366,10 +2366,11 @@ static bool perf_hw_regs(struct pt_regs *regs)
void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
+ struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
struct unwind_state state;
unsigned long addr;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (guest_cbs && guest_cbs->is_in_guest()) {
/* TODO: We don't support guest os callchain now */
return;
}
@@ -2475,10 +2476,11 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
+ struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
struct stack_frame frame;
const unsigned long __user *fp;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (guest_cbs && guest_cbs->is_in_guest()) {
/* TODO: We don't support guest os callchain now */
return;
}
@@ -2562,18 +2564,21 @@ static unsigned long code_segment_base(struct pt_regs *regs)
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
- return perf_guest_cbs->get_guest_ip();
+ struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
+
+ if (guest_cbs && guest_cbs->is_in_guest())
+ return guest_cbs->get_guest_ip();
return regs->ip + code_segment_base(regs);
}
unsigned long perf_misc_flags(struct pt_regs *regs)
{
+ struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
int misc = 0;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- if (perf_guest_cbs->is_user_mode())
+ if (guest_cbs && guest_cbs->is_in_guest()) {
+ if (guest_cbs->is_user_mode())
misc |= PERF_RECORD_MISC_GUEST_USER;
else
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 70758f99c9e4..b33540e1efa8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2333,6 +2333,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
{
struct perf_sample_data data;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_guest_info_callbacks *guest_cbs;
int bit;
int handled = 0;
@@ -2386,9 +2387,11 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
*/
if (__test_and_clear_bit(55, (unsigned long *)&status)) {
handled++;
- if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
- perf_guest_cbs->handle_intel_pt_intr))
- perf_guest_cbs->handle_intel_pt_intr();
+
+ guest_cbs = perf_get_guest_cbs();
+ if (unlikely(guest_cbs && guest_cbs->is_in_guest() &&
+ guest_cbs->handle_intel_pt_intr))
+ guest_cbs->handle_intel_pt_intr();
else
intel_pt_interrupt();
}
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index da289a44d511..f4d2322f4c62 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -460,7 +460,7 @@ static u64 pt_config_filters(struct perf_event *event)
pt->filters.filter[range].msr_b = filter->msr_b;
}
- rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;
+ rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
}
return rtit_ctl;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 9096a1693942..0f61f46e6086 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3479,6 +3479,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
struct extra_reg *er;
int idx = 0;
+ /* Any of the CHA events may be filtered by Thread/Core-ID.*/
+ if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN)
+ idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID;
for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
if (er->event != (event->hw.config & er->config_mask))
@@ -3546,6 +3549,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
EVENT_CONSTRAINT_END
};
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 812db1ac8cb1..df4a4a9dc1e8 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -163,6 +163,9 @@ void set_hv_tscchange_cb(void (*cb)(void))
return;
}
+ if (!hv_vp_index)
+ return;
+
hv_reenlightenment_cb = cb;
/* Make sure callback is registered before we write to MSRs */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 22c4dfe65992..b4dd6ab0fdfc 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -31,15 +31,13 @@ typedef s64 __attribute__((aligned(4))) compat_s64;
typedef u64 __attribute__((aligned(4))) compat_u64;
struct compat_stat {
- compat_dev_t st_dev;
- u16 __pad1;
+ u32 st_dev;
compat_ino_t st_ino;
compat_mode_t st_mode;
compat_nlink_t st_nlink;
__compat_uid_t st_uid;
__compat_gid_t st_gid;
- compat_dev_t st_rdev;
- u16 __pad2;
+ u32 st_rdev;
u32 st_size;
u32 st_blksize;
u32 st_blocks;
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index d912457f56a7..56eb9a6524e9 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -96,6 +96,7 @@
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
+#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
@@ -107,6 +108,7 @@
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+/* free ( 3*32+29) */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
@@ -202,7 +204,7 @@
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
-#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
+#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 03b3de491b5e..5ed702e2c55f 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -560,9 +560,11 @@ static inline void __fpregs_load_activate(void)
* The FPU context is only stored/restored for a user task and
* PF_KTHREAD is used to distinguish between kernel and user threads.
*/
-static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
+static inline void switch_fpu_prepare(struct task_struct *prev, int cpu)
{
- if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) {
+ struct fpu *old_fpu = &prev->thread.fpu;
+
+ if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) {
if (!copy_fpregs_to_fpstate(old_fpu))
old_fpu->last_cpu = -1;
else
@@ -581,10 +583,11 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
* Load PKRU from the FPU context if available. Delay loading of the
* complete FPU state until the return to userland.
*/
-static inline void switch_fpu_finish(struct fpu *new_fpu)
+static inline void switch_fpu_finish(struct task_struct *next)
{
u32 pkru_val = init_pkru_value;
struct pkru_state *pk;
+ struct fpu *next_fpu = &next->thread.fpu;
if (!static_cpu_has(X86_FEATURE_FPU))
return;
@@ -598,7 +601,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
* PKRU state is switched eagerly because it needs to be valid before we
* return to userland e.g. for a copy_to_user() operation.
*/
- if (!(current->flags & PF_KTHREAD)) {
+ if (!(next->flags & PF_KTHREAD)) {
/*
* If the PKRU bit in xsave.header.xfeatures is not set,
* then the PKRU component was in init state, which means
@@ -607,7 +610,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
* in memory is not valid. This means pkru_val has to be
* set to 0 and not to init_pkru_value.
*/
- pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU);
+ pk = get_xsave_addr(&next_fpu->state.xsave, XFEATURE_PKRU);
pkru_val = pk ? pk->pkru : 0;
}
__write_pkru(pkru_val);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 2b7cc5397f80..91a06cef50c1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -133,11 +133,13 @@ extern void load_ucode_ap(void);
void reload_early_microcode(void);
extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
extern bool initrd_gone;
+void microcode_bsp_resume(void);
#else
static inline int __init microcode_init(void) { return 0; };
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
+static inline void microcode_bsp_resume(void) { }
static inline bool
get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; }
#endif
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index b222a3595946..956df82bbc2b 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -115,7 +115,7 @@
ANNOTATE_NOSPEC_ALTERNATIVE
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \
__stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_LFENCE
#else
jmp *\reg
#endif
@@ -126,7 +126,7 @@
ANNOTATE_NOSPEC_ALTERNATIVE
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \
__stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_LFENCE
#else
call *\reg
#endif
@@ -171,7 +171,7 @@
"lfence;\n" \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
- X86_FEATURE_RETPOLINE_AMD)
+ X86_FEATURE_RETPOLINE_LFENCE)
# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
#else /* CONFIG_X86_32 */
@@ -201,7 +201,7 @@
"lfence;\n" \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
- X86_FEATURE_RETPOLINE_AMD)
+ X86_FEATURE_RETPOLINE_LFENCE)
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif
@@ -213,9 +213,11 @@
/* The Spectre V2 mitigation variants */
enum spectre_v2_mitigation {
SPECTRE_V2_NONE,
- SPECTRE_V2_RETPOLINE_GENERIC,
- SPECTRE_V2_RETPOLINE_AMD,
- SPECTRE_V2_IBRS_ENHANCED,
+ SPECTRE_V2_RETPOLINE,
+ SPECTRE_V2_LFENCE,
+ SPECTRE_V2_EIBRS,
+ SPECTRE_V2_EIBRS_RETPOLINE,
+ SPECTRE_V2_EIBRS_LFENCE,
};
/* The indirect branch speculation control variants */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ea85f23d9e22..1ae7c20f5469 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1375,8 +1375,8 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
#endif
#endif
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 09ecc32f6524..52d7512ea91a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -82,6 +82,7 @@ static inline void set_real_mode_mem(phys_addr_t mem)
}
void reserve_real_mode(void);
+void load_trampoline_pgtable(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4137a7342d68..7b75658b7e9a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1339,6 +1339,17 @@ static int __init disable_acpi_pci(const struct dmi_system_id *d)
return 0;
}
+static int __init disable_acpi_xsdt(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ pr_notice("%s detected: force use of acpi=rsdt\n", d->ident);
+ acpi_gbl_do_not_use_xsdt = TRUE;
+ } else {
+ pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n");
+ }
+ return 0;
+}
+
static int __init dmi_disable_acpi(const struct dmi_system_id *d)
{
if (!acpi_force) {
@@ -1463,6 +1474,19 @@ static const struct dmi_system_id acpi_dmi_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ /*
+ * Boxes that need ACPI XSDT use disabled due to corrupted tables
+ */
+ {
+ .callback = disable_acpi_xsdt,
+ .ident = "Advantech DAC-BJ01",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"),
+ DMI_MATCH(DMI_BIOS_VERSION, "V1.12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"),
+ },
+ },
{}
};
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index fcc4238ee95f..e817aaeef254 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -31,6 +31,7 @@
#include <asm/intel-family.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
+#include <linux/bpf.h>
#include "cpu.h"
@@ -607,6 +608,32 @@ static inline const char *spectre_v2_module_string(void)
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
+#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+
+#ifdef CONFIG_BPF_SYSCALL
+void unpriv_ebpf_notify(int new_state)
+{
+ if (new_state)
+ return;
+
+ /* Unprivileged eBPF is enabled */
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_EIBRS:
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ break;
+ case SPECTRE_V2_EIBRS_LFENCE:
+ if (sched_smt_active())
+ pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ break;
+ default:
+ break;
+ }
+}
+#endif
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt);
@@ -621,7 +648,10 @@ enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_FORCE,
SPECTRE_V2_CMD_RETPOLINE,
SPECTRE_V2_CMD_RETPOLINE_GENERIC,
- SPECTRE_V2_CMD_RETPOLINE_AMD,
+ SPECTRE_V2_CMD_RETPOLINE_LFENCE,
+ SPECTRE_V2_CMD_EIBRS,
+ SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+ SPECTRE_V2_CMD_EIBRS_LFENCE,
};
enum spectre_v2_user_cmd {
@@ -694,6 +724,13 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
return SPECTRE_V2_USER_CMD_AUTO;
}
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return (mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE);
+}
+
static void __init
spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
{
@@ -756,10 +793,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
}
/*
- * If enhanced IBRS is enabled or SMT impossible, STIBP is not
+ * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not
* required.
*/
- if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !smt_possible ||
+ spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return;
/*
@@ -771,12 +810,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
mode = SPECTRE_V2_USER_STRICT_PREFERRED;
- /*
- * If STIBP is not available, clear the STIBP mode.
- */
- if (!boot_cpu_has(X86_FEATURE_STIBP))
- mode = SPECTRE_V2_USER_NONE;
-
spectre_v2_user_stibp = mode;
set_mode:
@@ -785,9 +818,11 @@ set_mode:
static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
- [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
- [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
- [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
+ [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
};
static const struct {
@@ -798,8 +833,12 @@ static const struct {
{ "off", SPECTRE_V2_CMD_NONE, false },
{ "on", SPECTRE_V2_CMD_FORCE, true },
{ "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
+ { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
{ "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "eibrs", SPECTRE_V2_CMD_EIBRS, false },
+ { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false },
+ { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false },
{ "auto", SPECTRE_V2_CMD_AUTO, false },
};
@@ -836,17 +875,30 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
+ cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
!IS_ENABLED(CONFIG_RETPOLINE)) {
- pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option);
+ pr_err("%s selected but not compiled in. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((cmd == SPECTRE_V2_CMD_EIBRS ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n",
+ mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
- boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
+ if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n",
+ mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
@@ -855,6 +907,16 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return cmd;
}
+static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+{
+ if (!IS_ENABLED(CONFIG_RETPOLINE)) {
+ pr_err("Kernel not compiled with retpoline; no mitigation available!");
+ return SPECTRE_V2_NONE;
+ }
+
+ return SPECTRE_V2_RETPOLINE;
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -875,49 +937,64 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- mode = SPECTRE_V2_IBRS_ENHANCED;
- /* Force it so VMEXIT will restore correctly */
- x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
- goto specv2_set_mode;
+ mode = SPECTRE_V2_EIBRS;
+ break;
}
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_auto;
+
+ mode = spectre_v2_select_retpoline();
break;
- case SPECTRE_V2_CMD_RETPOLINE_AMD:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_amd;
+
+ case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
+ pr_err(SPECTRE_V2_LFENCE_MSG);
+ mode = SPECTRE_V2_LFENCE;
break;
+
case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_generic;
+ mode = SPECTRE_V2_RETPOLINE;
break;
+
case SPECTRE_V2_CMD_RETPOLINE:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_auto;
+ mode = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS:
+ mode = SPECTRE_V2_EIBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_LFENCE:
+ mode = SPECTRE_V2_EIBRS_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
+ mode = SPECTRE_V2_EIBRS_RETPOLINE;
break;
}
- pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!");
- return;
-retpoline_auto:
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- retpoline_amd:
- if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
- pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
- goto retpoline_generic;
- }
- mode = SPECTRE_V2_RETPOLINE_AMD;
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
- } else {
- retpoline_generic:
- mode = SPECTRE_V2_RETPOLINE_GENERIC;
+ if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+
+ if (spectre_v2_in_eibrs_mode(mode)) {
+ /* Force it so VMEXIT will restore correctly */
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ }
+
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ case SPECTRE_V2_EIBRS:
+ break;
+
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+ fallthrough;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
}
-specv2_set_mode:
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
@@ -943,7 +1020,7 @@ specv2_set_mode:
* the CPU supports Enhanced IBRS, kernel might un-intentionally not
* enable IBRS around firmware calls.
*/
- if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) {
+ if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
@@ -1013,6 +1090,10 @@ void cpu_bugs_smt_update(void)
{
mutex_lock(&spec_ctrl_mutex);
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
break;
@@ -1267,7 +1348,6 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return 0;
-
/*
* With strict mode for both IBPB and STIBP, the instruction
* code paths avoid checking this task flag and instead,
@@ -1614,7 +1694,7 @@ static ssize_t tsx_async_abort_show_state(char *buf)
static char *stibp_state(void)
{
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return "";
switch (spectre_v2_user_stibp) {
@@ -1644,6 +1724,27 @@ static char *ibpb_state(void)
return "";
}
+static ssize_t spectre_v2_show_state(char *buf)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+ return sprintf(buf, "Vulnerable: LFENCE\n");
+
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+ return sprintf(buf, "%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ spectre_v2_module_string());
+}
+
static ssize_t srbds_show_state(char *buf)
{
return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
@@ -1669,12 +1770,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
case X86_BUG_SPECTRE_V2:
- return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
- ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
- spectre_v2_module_string());
+ return spectre_v2_show_state(buf);
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index c2a9762d278d..8a2b8e791314 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -310,11 +310,17 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static void mce_panic(const char *msg, struct mce *final, char *exp)
+static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
- int apei_err = 0;
struct llist_node *pending;
struct mce_evt_llist *l;
+ int apei_err = 0;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
if (!fake_panic) {
/*
@@ -329,7 +335,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
} else {
/* Don't log too much for fake panic */
if (atomic_inc_return(&mce_fake_panicked) > 1)
- return;
+ goto out;
}
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
@@ -367,6 +373,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
}
/* Support code for software error injection */
@@ -691,7 +700,7 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));
@@ -1071,10 +1080,13 @@ static int mce_start(int *no_way_out)
* Synchronize between CPUs after main scanning loop.
* This invokes the bulk of the Monarch processing.
*/
-static int mce_end(int order)
+static noinstr int mce_end(int order)
{
- int ret = -1;
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
if (!timeout)
goto reset;
@@ -1118,7 +1130,8 @@ static int mce_end(int order)
/*
* Don't reset anything. That's done by the Monarch.
*/
- return 0;
+ ret = 0;
+ goto out;
}
/*
@@ -1133,6 +1146,10 @@ reset:
* Let others run again.
*/
atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
return ret;
}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index eb2d41c1816d..e1fda5b19b6f 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -347,7 +347,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf,
char buf[MAX_FLAG_OPT_SIZE], *__buf;
int err;
- if (cnt > MAX_FLAG_OPT_SIZE)
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 4a4198b806b4..c95a27513a30 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -772,9 +772,9 @@ static struct subsys_interface mc_cpu_interface = {
};
/**
- * mc_bp_resume - Update boot CPU microcode during resume.
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
*/
-static void mc_bp_resume(void)
+void microcode_bsp_resume(void)
{
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -786,7 +786,7 @@ static void mc_bp_resume(void)
}
static struct syscore_ops mc_syscore_ops = {
- .resume = mc_bp_resume,
+ .resume = microcode_bsp_resume,
};
static int mc_cpu_starting(unsigned int cpu)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index adf9b71386ef..53004dbd55c4 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
{ X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
+ { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 6f6b1d04dadf..50225bc0383b 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -515,6 +515,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = {
.stolen_size = gen9_stolen_size,
};
+/* Intel integrated GPUs for which we need to reserve "stolen memory" */
static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_I830_IDS(&i830_early_ops),
INTEL_I845G_IDS(&i845_early_ops),
@@ -587,6 +588,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func)
u16 device;
int i;
+ /*
+ * Reserve "stolen memory" for an integrated GPU. If we've already
+ * found one, there's nothing to do for other (discrete) GPUs.
+ */
+ if (resource_size(&intel_graphics_stolen_res))
+ return;
+
device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
@@ -699,7 +707,7 @@ static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
- QFLAG_APPLY_ONCE, intel_graphics_quirks },
+ 0, intel_graphics_quirks },
/*
* HPET on the current version of the Baytrail platform has accuracy
* problems: it will halt in deep idle state - so we disable it.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6ff2c7cac4c4..f582dda8dd34 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -59,6 +59,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;
+static int has_guest_poll = 0;
/*
* No need for any "IO delay" on KVM
*/
@@ -487,7 +488,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
ipi_bitmap <<= min - apic_id;
min = apic_id;
- } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+ } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
max = apic_id < max ? max : apic_id;
} else {
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
@@ -584,14 +585,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
static int kvm_suspend(void)
{
+ u64 val = 0;
+
kvm_guest_cpu_offline(false);
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ rdmsrl(MSR_KVM_POLL_CONTROL, val);
+ has_guest_poll = !(val & 1);
+#endif
return 0;
}
static void kvm_resume(void)
{
kvm_cpu_online(raw_smp_processor_id());
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+ wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+#endif
}
static struct syscore_ops kvm_syscore_ops = {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index b8ceec4974fe..352f876950ab 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -229,14 +229,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
- struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_fpu, cpu);
+ switch_fpu_prepare(prev_p, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -292,7 +290,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish(next_p);
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index da3cc3a10d63..633788362906 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -505,15 +505,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
- struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_fpu, cpu);
+ switch_fpu_prepare(prev_p, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -565,7 +563,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish(next_p);
/* Reload sp0. */
update_task_stack(next_p);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d65d1afb2716..fdef27a84d71 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type)
spin_unlock(&rtc_lock);
/*
- * Switch back to the initial page table.
+ * Switch to the trampoline page table.
*/
-#ifdef CONFIG_X86_32
- load_cr3(initial_page_table);
-#else
- write_cr3(real_mode_header->trampoline_pgd);
-
- /* Exiting long mode will fail if CR4.PCIDE is set. */
- if (boot_cpu_has(X86_FEATURE_PCID))
- cr4_clear_bits(X86_CR4_PCIDE);
-#endif
+ load_trampoline_pgtable();
/* Jump to the identity-mapped low memory code */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7e322e2daaf5..fe4200b89582 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1162,6 +1162,12 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1178,6 +1184,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows to detect even minimal
+ * modifications
+ * - not more than two sockets. As the number of sockets cannot be
+ * evaluated at the early boot stage where this has to be
+ * invoked, check the number of online memory nodes as a
+ * fallback solution which is an reasonable estimate.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ nr_online_nodes <= 2)
+ tsc_disable_clocksource_watchdog();
}
/*
@@ -1369,9 +1392,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
- if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1506,7 +1526,7 @@ void __init tsc_init(void)
}
if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ tsc_disable_clocksource_watchdog();
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index ec534f978867..59b114306300 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6a8db8eb0e94..62c7f771a7cf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -592,6 +592,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
union cpuid10_eax eax;
union cpuid10_edx edx;
+ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
perf_get_x86_pmu_capability(&cap);
/*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 60c8dcb907a5..ea48a2fb1677 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1714,11 +1714,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
}
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
dpl = seg_desc.dpl;
switch (seg) {
@@ -1758,6 +1753,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
+ if (!seg_desc.p) {
+ err_vec = NP_VECTOR;
+ goto exception;
+ }
old_desc = seg_desc;
seg_desc.type |= 2; /* busy */
ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
@@ -1782,6 +1781,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
break;
}
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
if (seg_desc.s) {
/* mark segment as accessed */
if (!(seg_desc.type & 1)) {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 26408434b9bc..ca66459a2e89 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -205,7 +205,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
int ret;
- if (!synic->active && !host)
+ if (!synic->active && (!host || data))
return 1;
trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
@@ -251,6 +251,9 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
case HV_X64_MSR_EOM: {
int i;
+ if (!synic->active)
+ break;
+
for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
kvm_hv_notify_acked_sint(vcpu, i);
break;
@@ -514,6 +517,11 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
{
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && (!host || config))
+ return 1;
trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);
@@ -533,6 +541,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
bool host)
{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && (!host || count))
+ return 1;
+
trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, count, host);
@@ -1501,11 +1515,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (all_cpus)
+ goto check_and_send_ipi;
+
if (!sparse_banks_len)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
+ if (kvm_read_guest(kvm,
ingpa + offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents),
sparse_banks,
@@ -1513,6 +1529,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index eea2d6f10f59..3696b4de9d99 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -118,7 +118,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}
EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt);
@@ -2099,10 +2100,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4));
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d4a8ad6c6a4b..1a1d2b5e7b35 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -34,9 +34,8 @@
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
- #define CMPXCHG cmpxchg
+ #define CMPXCHG "cmpxchgq"
#else
- #define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 2
#endif
#elif PTTYPE == 32
@@ -52,7 +51,7 @@
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
- #define CMPXCHG cmpxchg
+ #define CMPXCHG "cmpxchgl"
#elif PTTYPE == PTTYPE_EPT
#define pt_element_t u64
#define guest_walker guest_walkerEPT
@@ -65,8 +64,10 @@
#define PT_GUEST_DIRTY_SHIFT 9
#define PT_GUEST_ACCESSED_SHIFT 8
#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
- #define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 4
+ #ifdef CONFIG_X86_64
+ #define CMPXCHG "cmpxchgq"
+ #endif
#else
#error Invalid PTTYPE value
#endif
@@ -132,43 +133,39 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t __user *ptep_user, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
{
- int npages;
- pt_element_t ret;
- pt_element_t *table;
- struct page *page;
-
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
- if (likely(npages == 1)) {
- table = kmap_atomic(page);
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- kunmap_atomic(table);
-
- kvm_release_page_dirty(page);
- } else {
- struct vm_area_struct *vma;
- unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
- unsigned long pfn;
- unsigned long paddr;
-
- down_read(&current->mm->mmap_sem);
- vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
- if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
- up_read(&current->mm->mmap_sem);
- return -EFAULT;
- }
- pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- paddr = pfn << PAGE_SHIFT;
- table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
- if (!table) {
- up_read(&current->mm->mmap_sem);
- return -EFAULT;
- }
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- memunmap(table);
- up_read(&current->mm->mmap_sem);
- }
+ int r = -EFAULT;
+
+ if (!user_access_begin(ptep_user, sizeof(pt_element_t)))
+ return -EFAULT;
+
+#ifdef CMPXCHG
+ asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n"
+ "mov $0, %[r]\n"
+ "setnz %b[r]\n"
+ "2:"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : [ptr] "+m" (*ptep_user),
+ [old] "+a" (orig_pte),
+ [r] "+q" (r)
+ : [new] "r" (new_pte)
+ : "memory");
+#else
+ asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n"
+ "movl $0, %[r]\n"
+ "jz 2f\n"
+ "incl %[r]\n"
+ "2:"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : [ptr] "+m" (*ptep_user),
+ [old] "+A" (orig_pte),
+ [r] "+rm" (r)
+ : [new_lo] "b" ((u32)new_pte),
+ [new_hi] "c" ((u32)(new_pte >> 32))
+ : "memory");
+#endif
- return (ret != orig_pte);
+ user_access_end();
+ return r;
}
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 46875bbd0419..e0e3776059af 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -191,7 +191,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
}
if (type == PERF_TYPE_RAW)
- config = eventsel & X86_RAW_EVENT_MASK;
+ config = eventsel & AMD64_RAW_EVENT_MASK;
pmc_reprogram_counter(pmc, type, config,
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index c8388389a3b0..6bc656abbe66 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -245,12 +245,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* MSR_EVNTSELn */
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
- if (data == pmc->eventsel)
- return 0;
- if (!(data & pmu->reserved_bits)) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel)
reprogram_gp_counter(pmc, data);
- return 0;
- }
+ return 0;
}
return 1;
@@ -266,7 +264,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
- pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->reserved_bits = 0xfffffff000280000ull;
pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 425444d08071..125970286f28 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1904,7 +1904,8 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
uint8_t *page_virtual;
unsigned long i;
- if (npages == 0 || pages == NULL)
+ if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
+ pages == NULL)
return;
for (i = 0; i < npages; i++) {
@@ -4585,8 +4586,6 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
break;
}
case AVIC_IPI_FAILURE_INVALID_TARGET:
- WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, svm->vcpu.vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 07ebf6882a45..632bed227152 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -58,7 +58,9 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
SECONDARY_EXEC_SHADOW_VMCS | \
SECONDARY_EXEC_TSC_SCALING | \
SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0aaf40be956f..1f7dfa5aa42d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1218,7 +1218,7 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_UMWAIT_CONTROL,
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 1847e993ac63..f3f7f4cb15a6 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -142,7 +142,7 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
/* cache copy and flush to align dest */
if (!IS_ALIGNED(dest, 8)) {
- unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
+ size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest);
memcpy((void *) dest, (void *) source, len);
clean_cache_range((void *) dest, len);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 281e584cfe39..d61313f5c5b9 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1967,7 +1967,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/*
* Before changing the encryption attribute, we need to flush caches.
*/
- cpa_flush(&cpa, 1);
+ cpa_flush(&cpa, !this_cpu_has(X86_FEATURE_SME_COHERENT));
ret = __change_page_attr_set_clr(&cpa, 1);
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index aefe845dff59..6ca88fbc009c 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -279,7 +279,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
- new = early_memremap(new_phys, new_size);
+ new = early_memremap_prot(new_phys, new_size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 915bb1639763..20dd7023132f 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -25,6 +25,7 @@
#include <asm/cpu.h>
#include <asm/mmu_context.h>
#include <asm/cpu_device_id.h>
+#include <asm/microcode.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -40,7 +41,8 @@ static void msr_save_context(struct saved_context *ctxt)
struct saved_msr *end = msr + ctxt->saved_msrs.num;
while (msr < end) {
- msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q);
+ if (msr->valid)
+ rdmsrl(msr->info.msr_no, msr->info.reg.q);
msr++;
}
}
@@ -262,6 +264,13 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
perf_restore_debug_store();
+
+ microcode_bsp_resume();
+
+ /*
+ * This needs to happen after the microcode has been updated upon resume
+ * because some of the MSRs are "emulated" in microcode.
+ */
msr_restore_context(ctxt);
}
@@ -421,8 +430,10 @@ static int msr_build_context(const u32 *msr_id, const int num)
}
for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) {
+ u64 dummy;
+
msr_array[i].info.msr_no = msr_id[j];
- msr_array[i].valid = false;
+ msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy);
msr_array[i].info.reg.q = 0;
}
saved_msrs->num = total_num;
@@ -509,10 +520,24 @@ static int pm_cpu_check(const struct x86_cpu_id *c)
return ret;
}
+static void pm_save_spec_msr(void)
+{
+ u32 spec_msr_id[] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_TSX_CTRL,
+ MSR_TSX_FORCE_ABORT,
+ MSR_IA32_MCU_OPT_CTRL,
+ MSR_AMD64_LS_CFG,
+ };
+
+ msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id));
+}
+
static int pm_check_save_msr(void)
{
dmi_check_system(msr_save_dmi_table);
pm_cpu_check(msr_save_cpu_table);
+ pm_save_spec_msr();
return 0;
}
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 262f83cad355..fac50ebb122b 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -16,6 +16,32 @@ u32 *trampoline_cr4_features;
/* Hold the pgd entry used on booting additional CPUs */
pgd_t trampoline_pgd_entry;
+void load_trampoline_pgtable(void)
+{
+#ifdef CONFIG_X86_32
+ load_cr3(initial_page_table);
+#else
+ /*
+ * This function is called before exiting to real-mode and that will
+ * fail with CR4.PCIDE still set.
+ */
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
+
+ write_cr3(real_mode_header->trampoline_pgd);
+#endif
+
+ /*
+ * The CR3 write above will not flush global TLB entries.
+ * Stale, global entries from previous page tables may still be
+ * present. Flush those stale entries.
+ *
+ * This ensures that memory accessed while running with
+ * trampoline_pgd is *actually* mapped into trampoline_pgd.
+ */
+ __flush_tlb_all();
+}
+
void __init reserve_real_mode(void)
{
phys_addr_t mem;
@@ -50,6 +76,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -108,8 +135,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif
}
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 58f51667e2e4..8249685b4096 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -11,6 +11,7 @@
#include <linux/uaccess.h>
#include <asm/prctl.h> /* XXX This should get the constants from libc */
#include <os.h>
+#include <registers.h>
long arch_prctl(struct task_struct *task, int option,
unsigned long __user *arg2)
@@ -35,7 +36,7 @@ long arch_prctl(struct task_struct *task, int option,
switch (option) {
case ARCH_SET_FS:
case ARCH_SET_GS:
- ret = restore_registers(pid, &current->thread.regs.regs);
+ ret = restore_pid_registers(pid, &current->thread.regs.regs);
if (ret)
return ret;
break;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index e13b0b49fcdf..d7249f4c90f1 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -512,10 +512,7 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
return ret;
}
-bool is_xen_pmu(int cpu)
-{
- return (get_xenpmu_data() != NULL);
-}
+bool is_xen_pmu;
void xen_pmu_init(int cpu)
{
@@ -526,7 +523,7 @@ void xen_pmu_init(int cpu)
BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
- if (xen_hvm_domain())
+ if (xen_hvm_domain() || (cpu != 0 && !is_xen_pmu))
return;
xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
@@ -547,7 +544,8 @@ void xen_pmu_init(int cpu)
per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
per_cpu(xenpmu_shared, cpu).flags = 0;
- if (cpu == 0) {
+ if (!is_xen_pmu) {
+ is_xen_pmu = true;
perf_register_guest_info_callbacks(&xen_guest_cbs);
xen_pmu_arch_init();
}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
index 0e83a160589b..65c58894fc79 100644
--- a/arch/x86/xen/pmu.h
+++ b/arch/x86/xen/pmu.h
@@ -4,6 +4,8 @@
#include <xen/interface/xenpmu.h>
+extern bool is_xen_pmu;
+
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
#ifdef CONFIG_XEN_HAVE_VPMU
void xen_pmu_init(int cpu);
@@ -12,7 +14,6 @@ void xen_pmu_finish(int cpu);
static inline void xen_pmu_init(int cpu) {}
static inline void xen_pmu_finish(int cpu) {}
#endif
-bool is_xen_pmu(int cpu);
bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
int pmu_apic_update(uint32_t reg);
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index f8d39440b292..e5bd9eb42191 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -19,6 +19,12 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void)
xen_vcpu_setup(0);
/*
+ * Called again in case the kernel boots on vcpu >= MAX_VIRT_CPUS.
+ * Refer to comments in xen_hvm_init_time_ops().
+ */
+ xen_hvm_init_time_ops();
+
+ /*
* The alternative logic (which patches the unlock/lock) runs before
* the smp bootup up code is activated. Hence we need to set this up
* the core kernel is being patched. Otherwise we will have only
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 0cebe5db691d..64e6ec2c32a7 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -129,7 +129,7 @@ int xen_smp_intr_init_pv(unsigned int cpu)
per_cpu(xen_irq_work, cpu).irq = rc;
per_cpu(xen_irq_work, cpu).name = callfunc_name;
- if (is_xen_pmu(cpu)) {
+ if (is_xen_pmu) {
pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
xen_pmu_irq_handler,
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index befbdd8b17f0..4ec6c80f7687 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -547,6 +547,11 @@ static void xen_hvm_setup_cpu_clockevents(void)
void __init xen_hvm_init_time_ops(void)
{
+ static bool hvm_time_initialized;
+
+ if (hvm_time_initialized)
+ return;
+
/*
* vector callback is needed otherwise we cannot receive interrupts
* on cpu > 0 and at this point we don't know how many cpus are
@@ -556,7 +561,22 @@ void __init xen_hvm_init_time_ops(void)
return;
if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
- pr_info("Xen doesn't support pvclock on HVM, disable pv timer");
+ pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
+ return;
+ }
+
+ /*
+ * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
+ * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest
+ * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access
+ * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic.
+ *
+ * The xen_hvm_init_time_ops() should be called again later after
+ * __this_cpu_read(xen_vcpu) is available.
+ */
+ if (!__this_cpu_read(xen_vcpu)) {
+ pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
+ xen_vcpu_nr(0));
return;
}
@@ -568,6 +588,8 @@ void __init xen_hvm_init_time_ops(void)
x86_platform.calibrate_tsc = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
x86_platform.set_wallclock = xen_set_wallclock;
+
+ hvm_time_initialized = true;
}
#endif