summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c34
-rw-r--r--arch/x86/kvm/cpuid.h2
-rw-r--r--arch/x86/kvm/emulate.c74
-rw-r--r--arch/x86/kvm/hyperv.c11
-rw-r--r--arch/x86/kvm/ioapic.c15
-rw-r--r--arch/x86/kvm/irq_comm.c2
-rw-r--r--arch/x86/kvm/lapic.c20
-rw-r--r--arch/x86/kvm/mmu.c460
-rw-r--r--arch/x86/kvm/mmu.h21
-rw-r--r--arch/x86/kvm/mmutrace.h59
-rw-r--r--arch/x86/kvm/mtrr.c9
-rw-r--r--arch/x86/kvm/paging_tmpl.h79
-rw-r--r--arch/x86/kvm/pmu.c4
-rw-r--r--arch/x86/kvm/pmu.h18
-rw-r--r--arch/x86/kvm/pmu_intel.c37
-rw-r--r--arch/x86/kvm/svm.c37
-rw-r--r--arch/x86/kvm/trace.h4
-rw-r--r--arch/x86/kvm/vmx.c246
-rw-r--r--arch/x86/kvm/x86.c191
19 files changed, 960 insertions, 363 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c17d3893ae60..63c3ff9e74d4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -279,13 +279,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
{
switch (func) {
case 0:
- entry->eax = 1; /* only one leaf currently */
+ entry->eax = 7;
++*nent;
break;
case 1:
entry->ecx = F(MOVBE);
++*nent;
break;
+ case 7:
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (index == 0)
+ entry->ecx = F(RDPID);
+ ++*nent;
default:
break;
}
@@ -355,7 +360,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
- F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
+ F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
+ F(AMD_SSB_NO) | F(AMD_STIBP);
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -380,14 +386,15 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES);
+ F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) |
+ F(INTEL_STIBP) | F(MD_CLEAR);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
r = -E2BIG;
- if (*nent >= maxnent)
+ if (WARN_ON(*nent >= maxnent))
goto out;
do_cpuid_1_ent(entry, function, index);
@@ -464,8 +471,17 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
+
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
cpuid_mask(&entry->edx, CPUID_7_EDX);
+ if (boot_cpu_has(X86_FEATURE_IBPB) &&
+ boot_cpu_has(X86_FEATURE_IBRS))
+ entry->edx |= F(SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ entry->edx |= F(INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->edx |= F(SPEC_CTRL_SSBD);
/*
* We emulate ARCH_CAPABILITIES in software even
* if the host doesn't support it.
@@ -633,7 +649,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ebx |= F(VIRT_SSBD);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
- if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ /*
+ * The preference is to use SPEC CTRL MSR instead of the
+ * VIRT_SPEC MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
entry->ebx |= F(VIRT_SSBD);
break;
}
@@ -676,6 +697,9 @@ out:
static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
u32 idx, int *nent, int maxnent, unsigned int type)
{
+ if (*nent >= maxnent)
+ return -E2BIG;
+
if (type == KVM_GET_EMULATED_CPUID)
return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 8a841b9d8f84..b2bf8e1d5782 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -176,7 +176,7 @@ static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS)))
+ if (best && (best->ebx & (bit(X86_FEATURE_AMD_IBRS | bit(X86_FEATURE_AMD_SSBD)))))
return true;
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD)));
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 510cfc06701a..da3cd734dee1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -21,6 +21,7 @@
*/
#include <linux/kvm_host.h>
+#include <linux/nospec.h>
#include "kvm_cache_regs.h"
#include <asm/kvm_emulate.h>
#include <linux/stringify.h>
@@ -2579,15 +2580,13 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
* CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
* supports long mode.
*/
- cr4 = ctxt->ops->get_cr(ctxt, 4);
if (emulator_has_longmode(ctxt)) {
struct desc_struct cs_desc;
/* Zero CR4.PCIDE before CR0.PG. */
- if (cr4 & X86_CR4_PCIDE) {
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PCIDE)
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
- cr4 &= ~X86_CR4_PCIDE;
- }
/* A 32-bit code segment is required to clear EFER.LMA. */
memset(&cs_desc, 0, sizeof(cs_desc));
@@ -2601,13 +2600,16 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if (cr0 & X86_CR0_PE)
ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
- /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
- if (cr4 & X86_CR4_PAE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+ if (emulator_has_longmode(ctxt)) {
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PAE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
- /* And finally go back to 32-bit mode. */
- efer = 0;
- ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+ }
smbase = ctxt->ops->get_smbase(ctxt);
if (emulator_has_longmode(ctxt))
@@ -3529,6 +3531,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc_aux = 0;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+ return emulate_gp(ctxt, 0);
+ ctxt->dst.val = tsc_aux;
+ return X86EMUL_CONTINUE;
+}
+
static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
{
u64 tsc = 0;
@@ -4389,10 +4401,20 @@ static const struct opcode group8[] = {
F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
};
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+};
+
+
static const struct group_dual group9 = { {
N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
}, {
- N, N, N, N, N, N, N, N,
+ N, N, N, N, N, N, N,
+ GP(0, &pfx_0f_c7_7),
} };
static const struct opcode group11[] = {
@@ -5000,6 +5022,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
ctxt->fetch.ptr = ctxt->fetch.data;
ctxt->fetch.end = ctxt->fetch.data + insn_len;
ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
if (insn_len > 0)
memcpy(ctxt->fetch.data, insn, insn_len);
else {
@@ -5052,16 +5075,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
ctxt->ad_bytes = def_ad_bytes ^ 6;
break;
case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
case 0x3e: /* DS override */
has_seg_override = true;
- ctxt->seg_override = (ctxt->b >> 3) & 3;
+ ctxt->seg_override = VCPU_SREG_DS;
break;
case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
case 0x65: /* GS override */
has_seg_override = true;
- ctxt->seg_override = ctxt->b & 7;
+ ctxt->seg_override = VCPU_SREG_GS;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
@@ -5145,10 +5180,15 @@ done_prefixes:
}
break;
case Escape:
- if (ctxt->modrm > 0xbf)
- opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
- else
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
break;
case InstrDual:
if ((ctxt->modrm >> 6) == 3)
@@ -5256,6 +5296,8 @@ done_prefixes:
ctxt->memopp->addr.mem.ea + ctxt->_eip);
done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 42b1c83741c8..5e837c96e93f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -28,6 +28,7 @@
#include <linux/kvm_host.h>
#include <linux/highmem.h>
+#include <linux/nospec.h>
#include <asm/apicdef.h>
#include <trace/events/kvm.h>
@@ -719,11 +720,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
u32 index, u64 *pdata)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- *pdata = hv->hv_crash_param[index];
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
return 0;
}
@@ -762,11 +764,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
u32 index, u64 data)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- hv->hv_crash_param[index] = data;
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
return 0;
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 5f810bb80802..aa34b16e62c2 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -36,6 +36,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/nospec.h>
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -73,13 +74,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
default:
{
u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
- u64 redir_content;
+ u64 redir_content = ~0ULL;
- if (redir_index < IOAPIC_NUM_PINS)
- redir_content =
- ioapic->redirtbl[redir_index].bits;
- else
- redir_content = ~0ULL;
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[index].bits;
+ }
result = (ioapic->ioregsel & 0x1) ?
(redir_content >> 32) & 0xffffffff :
@@ -299,6 +301,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
ioapic_debug("change redir index %x val %x\n", index, val);
if (index >= IOAPIC_NUM_PINS)
return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
e = &ioapic->redirtbl[index];
mask_before = e->fields.mask;
/* Preserve read-only fields */
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6c0191615f23..cf8b3c17657a 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -436,7 +436,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
- if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+ if (irq.trig_mode && kvm_apic_match_dest(vcpu, NULL, 0,
irq.dest_id, irq.dest_mode))
__set_bit(irq.vector, ioapic_handled_vectors);
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c8630569e392..3988e26af3b5 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -28,6 +28,7 @@
#include <linux/export.h>
#include <linux/math64.h>
#include <linux/slab.h>
+#include <linux/nospec.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/page.h>
@@ -531,9 +532,11 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
{
u8 val;
- if (pv_eoi_get_user(vcpu, &val) < 0)
+ if (pv_eoi_get_user(vcpu, &val) < 0) {
apic_debug("Can't read EOI MSR value: 0x%llx\n",
(unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return false;
+ }
return val & 0x1;
}
@@ -1587,15 +1590,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
- case APIC_LVTERR:
+ case APIC_LVTERR: {
/* TODO: Check vector */
+ size_t size;
+ u32 index;
+
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
-
- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+ size = ARRAY_SIZE(apic_lvt_mask);
+ index = array_index_nospec(
+ (reg - APIC_LVTT) >> 4, size);
+ val &= apic_lvt_mask[index];
kvm_lapic_set_reg(apic, reg, val);
-
break;
+ }
case APIC_LVTT:
if (!kvm_apic_sw_enabled(apic))
@@ -1992,7 +2000,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
- if (!apic_enabled(apic))
+ if (!kvm_apic_hw_enabled(apic))
return -1;
apic_update_ppr(apic);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 676edfc19a95..3a281a2decde 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,7 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/kthread.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -44,6 +45,30 @@
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+ .set = set_nx_huge_pages_recovery_ratio,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+
/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
@@ -131,9 +156,6 @@ module_param(dbg, bool, 0644);
#include <trace/events/kvm.h>
-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
@@ -142,6 +164,20 @@ module_param(dbg, bool, 0644);
/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+};
+
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
@@ -179,14 +215,23 @@ static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_present_mask;
static void mmu_spte_set(u64 *sptep, u64 spte);
+static bool is_executable_pte(u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
shadow_mmio_mask = mmio_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
/*
* the low bit of the generation number is always presumed to be zero.
* This disables mmio caching during memslot updates. The concept is
@@ -324,6 +369,11 @@ static int is_last_spte(u64 pte, int level)
return 0;
}
+static bool is_executable_pte(u64 spte)
+{
+ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
static kvm_pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -767,10 +817,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
- if (sp->role.direct)
- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
- else
+ if (!sp->role.direct) {
sp->gfns[index] = gfn;
+ return;
+ }
+
+ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+ pr_err_ratelimited("gfn mismatch under direct page %llx "
+ "(expected %llx, got %llx)\n",
+ sp->gfn,
+ kvm_mmu_page_get_gfn(sp, index), gfn);
}
/*
@@ -829,6 +885,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->lpage_disallowed_link,
+ &kvm->arch.lpage_disallowed_mmu_pages);
+ sp->lpage_disallowed = true;
+}
+
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
@@ -846,6 +913,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+ list_del(&sp->lpage_disallowed_link);
+}
+
static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
@@ -2382,6 +2456,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_reload_remote_mmus(kvm);
}
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
sp->role.invalid = 1;
return ret;
}
@@ -2533,6 +2610,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!speculative)
spte |= shadow_accessed_mask;
+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -2598,13 +2680,13 @@ done:
return ret;
}
-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- bool emulate = false;
+ int ret = RET_PF_RETRY;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2634,18 +2716,15 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- emulate = true;
+ ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep)))
- emulate = true;
+ ret = RET_PF_EMULATE;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
- is_large_pte(*sptep)? "2MB" : "4kB",
- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
- *sptep, sptep);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
@@ -2657,9 +2736,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
}
}
- kvm_release_pfn_clean(pfn);
-
- return emulate;
+ return ret;
}
static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2693,9 +2770,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
if (ret <= 0)
return -1;
- for (i = 0; i < ret; i++, gfn++, start++)
+ for (i = 0; i < ret; i++, gfn++, start++) {
mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
+ put_page(pages[i]);
+ }
return 0;
}
@@ -2743,40 +2822,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
- int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
{
- struct kvm_shadow_walk_iterator iterator;
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of
+ * the address.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+ int map_writable, int level, kvm_pfn_t pfn,
+ bool prefault, bool lpage_disallowed)
+{
+ struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int emulate = 0;
- gfn_t pseudo_gfn;
+ int ret;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ gfn_t base_gfn = gfn;
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return 0;
+ return RET_PF_RETRY;
- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
- if (iterator.level == level) {
- emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
- write, level, gfn, pfn, prefault,
- map_writable);
- direct_pte_prefetch(vcpu, iterator.sptep);
- ++vcpu->stat.pf_fixed;
- break;
- }
+ trace_kvm_mmu_spte_requested(gpa, level, pfn);
+ for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
- drop_large_spte(vcpu, iterator.sptep);
- if (!is_shadow_present_pte(*iterator.sptep)) {
- u64 base_addr = iterator.addr;
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == level)
+ break;
- base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
- pseudo_gfn = base_addr >> PAGE_SHIFT;
- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1, 1, ACC_ALL);
+ drop_large_spte(vcpu, it.sptep);
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+ it.level - 1, true, ACC_ALL);
- link_shadow_page(vcpu, iterator.sptep, sp);
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}
- return emulate;
+
+ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+ write, level, base_gfn, pfn, prefault,
+ map_writable);
+ direct_pte_prefetch(vcpu, it.sptep);
+ ++vcpu->stat.pf_fixed;
+ return ret;
}
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2798,25 +2908,23 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
- * Return 1 to tell kvm to emulate it.
*/
if (pfn == KVM_PFN_ERR_RO_FAULT)
- return 1;
+ return RET_PF_EMULATE;
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
- return 0;
+ return RET_PF_RETRY;
}
return -EFAULT;
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t *gfnp, kvm_pfn_t *pfnp,
+ gfn_t gfn, kvm_pfn_t *pfnp,
int *levelp)
{
kvm_pfn_t pfn = *pfnp;
- gfn_t gfn = *gfnp;
int level = *levelp;
/*
@@ -2826,7 +2934,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
* here.
*/
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
- level == PT_PAGE_TABLE_LEVEL &&
+ !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
PageTransCompoundMap(pfn_to_page(pfn)) &&
!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
@@ -2843,8 +2951,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
mask = KVM_PAGES_PER_HPAGE(level) - 1;
VM_BUG_ON((gfn & mask) != (pfn & mask));
if (pfn & mask) {
- gfn &= ~mask;
- *gfnp = gfn;
kvm_release_pfn_clean(pfn);
pfn &= ~mask;
kvm_get_pfn(pfn);
@@ -3012,11 +3118,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
- bool force_pt_level = false;
+ bool force_pt_level;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ force_pt_level = lpage_disallowed;
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
/*
@@ -3031,32 +3140,30 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
}
if (fast_page_fault(vcpu, v, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
return r;
+ r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+ prefault, false);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return r;
}
@@ -3383,38 +3490,38 @@ exit:
return reserved;
}
-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
if (mmio_info_in_cache(vcpu, addr, direct))
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
- return RET_MMIO_PF_BUG;
+ return -EINVAL;
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
- return RET_MMIO_PF_INVALID;
+ return RET_PF_INVALID;
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
}
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return RET_MMIO_PF_RETRY;
+ return RET_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
@@ -3464,7 +3571,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3548,18 +3655,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
- PT_DIRECTORY_LEVEL);
+ force_pt_level =
+ lpage_disallowed ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
if (level > PT_DIRECTORY_LEVEL &&
@@ -3569,32 +3679,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
}
if (fast_page_fault(vcpu, gpa, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
+ r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, lpage_disallowed);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return r;
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4510,23 +4618,24 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
enum emulation_result er;
bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+ r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
- if (r == RET_MMIO_PF_EMULATE) {
+ if (r == RET_PF_EMULATE) {
emulation_type = 0;
goto emulate;
}
- if (r == RET_MMIO_PF_RETRY)
- return 1;
- if (r < 0)
- return r;
}
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
if (r < 0)
return r;
- if (!r)
- return 1;
if (mmio_info_in_cache(vcpu, cr2, direct))
emulation_type = 0;
@@ -4781,9 +4890,9 @@ restart:
* the guest, and the guest page table is using 4K page size
* mapping if the indirect sp has level = 1.
*/
- if (sp->role.direct &&
- !kvm_is_reserved_pfn(pfn) &&
- PageTransCompoundMap(pfn_to_page(pfn))) {
+ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+ !kvm_is_zone_device_pfn(pfn) &&
+ PageTransCompoundMap(pfn_to_page(pfn))) {
drop_spte(kvm, sptep);
need_tlb_flush = 1;
goto restart;
@@ -4965,7 +5074,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
int nr_to_scan = sc->nr_to_scan;
unsigned long freed = 0;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
@@ -5015,7 +5124,7 @@ unlock:
break;
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
return freed;
}
@@ -5039,8 +5148,58 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = get_nx_auto_mode();
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+ int idx;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
int kvm_mmu_module_init(void)
{
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
@@ -5104,3 +5263,116 @@ void kvm_mmu_module_exit(void)
unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+ unsigned int old_val;
+ int err;
+
+ old_val = nx_huge_pages_recovery_ratio;
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ if (READ_ONCE(nx_huge_pages) &&
+ !old_val && nx_huge_pages_recovery_ratio) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of lpage_disallowed pages is expected to
+ * be relatively small compared to the total.
+ */
+ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ struct kvm_mmu_page,
+ lpage_disallowed_link);
+ WARN_ON_ONCE(!sp->lpage_disallowed);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+
+ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (to_zap)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+ ? start_time + 60 * HZ - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_lpages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_lpage_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_lpage_recovery_thread)
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c92834c55c59..e584689e7d46 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -56,23 +56,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- * directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- * fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
- RET_MMIO_PF_EMULATE = 1,
- RET_MMIO_PF_INVALID = 2,
- RET_MMIO_PF_RETRY = 0,
- RET_MMIO_PF_BUG = -1
-};
-
-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
@@ -202,4 +185,8 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
#endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 5a24b846a1cb..756b14ecc957 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -322,6 +322,65 @@ TRACE_EVENT(
__entry->kvm_gen == __entry->spte_gen
)
);
+
+TRACE_EVENT(
+ kvm_mmu_set_spte,
+ TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(level, gfn, sptep),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(u64, sptep)
+ __field(u8, level)
+ /* These depend on page entry type, so compute them now. */
+ __field(bool, r)
+ __field(bool, x)
+ __field(u8, u)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = *sptep;
+ __entry->sptep = virt_to_phys(sptep);
+ __entry->level = level;
+ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+ __entry->x = is_executable_pte(__entry->spte);
+ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ ),
+
+ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ __entry->gfn, __entry->spte,
+ __entry->r ? "r" : "-",
+ __entry->spte & PT_PRESENT_MASK ? "w" : "-",
+ __entry->x ? "x" : "-",
+ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->level, __entry->sptep
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_spte_requested,
+ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
+ TP_ARGS(addr, level, pfn),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, pfn)
+ __field(u8, level)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = addr >> PAGE_SHIFT;
+ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+ __entry->level = level;
+ ),
+
+ TP_printk("gfn %llx pfn %llx level %d",
+ __entry->gfn, __entry->pfn, __entry->level
+ )
+);
+
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 0149ac59c273..3e3016411020 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -17,6 +17,7 @@
*/
#include <linux/kvm_host.h>
+#include <linux/nospec.h>
#include <asm/mtrr.h>
#include "cpuid.h"
@@ -202,11 +203,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
break;
case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
*seg = 1;
- *unit = msr - MSR_MTRRfix16K_80000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix16K_80000,
+ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
break;
case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
*seg = 2;
- *unit = msr - MSR_MTRRfix4K_C0000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix4K_C0000,
+ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
break;
default:
return false;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 37363900297d..e03225e707b2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -499,6 +499,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
true, true);
+ kvm_release_pfn_clean(pfn);
return true;
}
@@ -572,12 +573,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
- kvm_pfn_t pfn, bool map_writable, bool prefault)
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, emulate;
+ int top_level, ret;
+ gfn_t gfn, base_gfn;
direct_access = gw->pte_access;
@@ -622,36 +625,49 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- for (;
- shadow_walk_okay(&it) && it.level > hlevel;
- shadow_walk_next(&it)) {
- gfn_t direct_gfn;
+ /*
+ * FNAME(page_fault) might have clobbered the bottom bits of
+ * gw->gfn, restore them from the virtual address.
+ */
+ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+ base_gfn = gfn;
+ trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+
+ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
- validate_direct_spte(vcpu, it.sptep, direct_access);
- drop_large_spte(vcpu, it.sptep);
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
- if (is_shadow_present_pte(*it.sptep))
- continue;
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == hlevel)
+ break;
- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ validate_direct_spte(vcpu, it.sptep, direct_access);
- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
- true, direct_access);
- link_shadow_page(vcpu, it.sptep, sp);
+ drop_large_spte(vcpu, it.sptep);
+
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+ it.level - 1, true, direct_access);
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
+ }
}
- clear_sp_write_flooding_count(it.sptep);
- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, gw->gfn, pfn, prefault, map_writable);
+ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, base_gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
-
- return emulate;
+ ++vcpu->stat.pf_fixed;
+ return ret;
out_gpte_changed:
- kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
/*
@@ -717,9 +733,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
kvm_pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ bool force_pt_level = lpage_disallowed;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -746,12 +764,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!prefault)
inject_page_fault(vcpu, &walker.fault);
- return 0;
+ return RET_PF_RETRY;
}
if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
shadow_page_table_clear_flood(vcpu, addr);
- return 1;
+ return RET_PF_EMULATE;
}
vcpu->arch.write_fault_to_shadow_pgtable = false;
@@ -773,7 +791,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
walker.gfn, pfn, walker.pte_access, &r))
@@ -799,6 +817,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
walker.pte_access &= ~ACC_EXEC_MASK;
}
+ r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
@@ -806,19 +825,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
make_mmu_pages_available(vcpu);
if (!force_pt_level)
- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
+ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
- level, pfn, map_writable, prefault);
- ++vcpu->stat.pf_fixed;
+ level, pfn, map_writable, prefault, lpage_disallowed);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return r;
}
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 06ce377dcbc9..0827ee7d0e9b 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -124,8 +124,8 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
intr ? kvm_perf_overflow_intr :
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
- printk_once("kvm_pmu: event creation failed %ld\n",
- PTR_ERR(event));
+ pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
+ PTR_ERR(event), pmc->idx);
return;
}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index f96e1f962587..fbf3d25af765 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -1,6 +1,8 @@
#ifndef __KVM_X86_PMU_H
#define __KVM_X86_PMU_H
+#include <linux/nospec.h>
+
#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
@@ -80,8 +82,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
u32 base)
{
- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
- return &pmu->gp_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
return NULL;
}
@@ -91,8 +97,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
{
int base = MSR_CORE_PERF_FIXED_CTR0;
- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
- return &pmu->fixed_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
return NULL;
}
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index 5ab4a364348e..84ae4dd261ca 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -87,10 +87,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
static unsigned intel_find_fixed_event(int idx)
{
- if (idx >= ARRAY_SIZE(fixed_pmc_events))
+ u32 event;
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+
+ if (idx >= size)
return PERF_COUNT_HW_MAX;
- return intel_arch_events[fixed_pmc_events[idx]].event_type;
+ event = fixed_pmc_events[array_index_nospec(idx, size)];
+ return intel_arch_events[event].event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -131,15 +135,19 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
struct kvm_pmc *counters;
+ unsigned int num_counters;
idx &= ~(3u << 30);
- if (!fixed && idx >= pmu->nr_arch_gp_counters)
- return NULL;
- if (fixed && idx >= pmu->nr_arch_fixed_counters)
+ if (fixed) {
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ } else {
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ }
+ if (idx >= num_counters)
return NULL;
- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
-
- return &counters[idx];
+ return &counters[array_index_nospec(idx, num_counters)];
}
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -235,11 +243,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, msr))) {
- if (!msr_info->host_initiated)
- data = (s64)(s32)data;
- pmc->counter += data - pmc_read_counter(pmc);
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if (msr_info->host_initiated)
+ pmc->counter = data;
+ else
+ pmc->counter = (s32)data;
+ return 0;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ pmc->counter = data;
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 01eb0451b96d..1079228e4fef 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -590,8 +590,14 @@ static int get_npt_level(void)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -1518,7 +1524,11 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+ /*
+ * Since the host physical APIC id is 8 bits,
+ * we can support host APIC ID upto 255.
+ */
+ if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
@@ -3704,7 +3714,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
return 1;
/* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
return 1;
svm->spec_ctrl = data;
@@ -3940,14 +3950,25 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = svm->vcpu.kvm;
struct kvm_lapic *apic = svm->vcpu.arch.apic;
/*
- * Update ICR high and low, then emulate sending IPI,
- * which is handled when writing APIC_ICR.
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
*/
- kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
- kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, apic,
+ icrl & KVM_APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & KVM_APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
break;
}
case AVIC_IPI_FAILURE_INVALID_TARGET:
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc6754ec5..ea618b713b6f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -434,13 +434,13 @@ TRACE_EVENT(kvm_apic_ipi,
);
TRACE_EVENT(kvm_apic_accept_irq,
- TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec),
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
TP_ARGS(apicid, dm, tm, vec),
TP_STRUCT__entry(
__field( __u32, apicid )
__field( __u16, dm )
- __field( __u8, tm )
+ __field( __u16, tm )
__field( __u8, vec )
),
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a34fb7284024..2ad59d8553a5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -714,7 +714,6 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
- u64 arch_capabilities;
u64 spec_ctrl;
u32 vm_entry_controls_shadow;
@@ -1548,7 +1547,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
return -1;
}
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
{
struct {
u64 vpid : 16;
@@ -1562,7 +1561,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
: : "a"(&operand), "c"(ext) : "cc", "memory");
}
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
{
struct {
u64 eptp, gpa;
@@ -1620,43 +1619,15 @@ static void vmcs_load(struct vmcs *vmcs)
}
#ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
static void crash_vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
- if (!crash_local_vmclear_enabled(cpu))
- return;
-
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
}
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
@@ -1668,19 +1639,24 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- crash_disable_local_vmclear(cpu);
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/*
- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
- * is before setting loaded_vmcs->vcpu to -1 which is done in
- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
- * then adds the vmcs into percpu list before it is deleted.
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->vcpu to
+ * -1, otherwise a different cpu can see vcpu == -1 first and add
+ * loaded_vmcs to its percpu list before it's deleted from this cpu's
+ * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
- loaded_vmcs_init(loaded_vmcs);
- crash_enable_local_vmclear(cpu);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
}
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -2220,17 +2196,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
- if (!enable_ept) {
- /*
- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
- * host CPUID is more efficient than testing guest CPUID
- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
- */
- if (boot_cpu_has(X86_FEATURE_SMEP))
- guest_efer |= EFER_NX;
- else if (!(guest_efer & EFER_NX))
- ignore_bits |= EFER_NX;
- }
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
@@ -2480,18 +2448,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!already_loaded) {
local_irq_disable();
- crash_disable_local_vmclear(cpu);
/*
- * Read loaded_vmcs->cpu should be before fetching
- * loaded_vmcs->loaded_vmcss_on_cpu_link.
- * See the comments in __loaded_vmcs_clear().
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smb_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
- crash_enable_local_vmclear(cpu);
local_irq_enable();
}
@@ -3209,12 +3176,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has_arch_capabilities(vcpu))
- return 1;
- msr_info->data = to_vmx(vcpu)->arch_capabilities;
- break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3376,11 +3337,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated)
- return 1;
- vmx->arch_capabilities = data;
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -3526,21 +3482,6 @@ static int hardware_enable(void)
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
- /*
- * Now we can enable the vmclear operation in kdump
- * since the loaded_vmcss_on_cpu list on this cpu
- * has been initialized.
- *
- * Though the cpu is not in VMX operation now, there
- * is no problem to enable the vmclear operation
- * for the loaded_vmcss_on_cpu list is empty!
- */
- crash_enable_local_vmclear(cpu);
-
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
@@ -4661,6 +4602,26 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
(ss.selector & SEGMENT_RPL_MASK));
}
+static bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu,
+ unsigned int port, int size);
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification;
+ unsigned short port;
+ int size;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
/*
* Check if guest state is valid. Returns true if valid, false if
* not.
@@ -5468,8 +5429,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- vmx->arch_capabilities = kvm_get_arch_capabilities();
-
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
/* 22.2.1, 20.8.1 */
@@ -6566,20 +6525,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
return 1;
}
else
- return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
- NULL, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, EMULTYPE_SKIP) ==
+ EMULATE_DONE;
}
- ret = handle_mmio_page_fault(vcpu, gpa, true);
- if (likely(ret == RET_MMIO_PF_EMULATE))
- return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
- EMULATE_DONE;
-
- if (unlikely(ret == RET_MMIO_PF_INVALID))
- return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
-
- if (unlikely(ret == RET_MMIO_PF_RETRY))
- return 1;
+ ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+ if (ret >= 0)
+ return ret;
/* It is the real ept misconfig */
WARN_ON(1);
@@ -7653,6 +7605,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
gva_t gva = 0;
+ struct x86_exception e;
if (!nested_vmx_check_permission(vcpu) ||
!nested_vmx_check_vmcs12(vcpu))
@@ -7679,8 +7632,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
vmx_instruction_info, true, &gva))
return 1;
/* _system ok, as nested_vmx_check_permission verified cpl=0 */
- kvm_write_guest_virt_system(vcpu, gva, &field_value,
- (is_long_mode(vcpu) ? 8 : 4), NULL);
+ if (kvm_write_guest_virt_system(vcpu, gva, &field_value,
+ (is_long_mode(vcpu) ? 8 : 4),
+ &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
}
nested_vmx_succeed(vcpu);
@@ -8050,23 +8007,17 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size)
{
- unsigned long exit_qualification;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
gpa_t bitmap, last_bitmap;
- unsigned int port;
- int size;
u8 b;
- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
- return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- port = exit_qualification >> 16;
- size = (exit_qualification & 7) + 1;
-
last_bitmap = (gpa_t)-1;
b = -1;
@@ -9220,8 +9171,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->__launched = vmx->loaded_vmcs->launched;
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
asm(
/* Store host registers */
@@ -9580,8 +9534,8 @@ free_vcpu:
return ERR_PTR(err);
}
-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
static int vmx_vm_init(struct kvm *kvm)
{
@@ -11356,11 +11310,71 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
}
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned short port;
+ bool intercept;
+ int size;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ }
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ intercept = nested_cpu_has(vmcs12,
+ CPU_BASED_UNCOND_IO_EXITING);
+ else
+ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return X86EMUL_CONTINUE;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ switch (info->intercept) {
+ /*
+ * RDPID causes #UD if disabled through secondary execution controls.
+ * Because it is marked as EmulateOnUD, we need to intercept it here.
+ */
+ case x86_intercept_rdtscp:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->exception.error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ return vmx_check_intercept_io(vcpu, info);
+
+ /* TODO: check more intercepts... */
+ default:
+ break;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
}
#ifdef CONFIG_X86_64
@@ -11842,7 +11856,7 @@ module_exit(vmx_exit)
static int __init vmx_init(void)
{
- int r;
+ int r, cpu;
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
@@ -11864,6 +11878,12 @@ static int __init vmx_init(void)
}
}
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a29df9ccbfde..314eb954bdee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
+#include <linux/nospec.h>
#include <trace/events/kvm.h>
#include <asm/debugreg.h>
@@ -191,6 +192,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits) },
{ NULL }
};
@@ -272,13 +274,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
int err;
- if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ value = (value & mask) | (smsr->values[slot].host & ~mask);
+ if (value == smsr->values[slot].curr)
return 0;
- smsr->values[slot].curr = value;
err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
if (err)
return 1;
+ smsr->values[slot].curr = value;
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
@@ -535,8 +538,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
data, offset, len, access);
}
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
+ rsvd_bits(1, 2);
+}
+
/*
- * Load the pae pdptrs. Return true is they are all valid.
+ * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
*/
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
@@ -555,8 +564,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
- (pdpte[i] &
- vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
+ (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
ret = 0;
goto out;
}
@@ -582,7 +590,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
gfn_t gfn;
int r;
- if (is_long_mode(vcpu) || !is_pae(vcpu))
+ if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
return false;
if (!test_bit(VCPU_EXREG_PDPTR,
@@ -882,9 +890,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- vcpu->arch.db[dr] = val;
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
@@ -921,9 +931,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[dr];
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
break;
case 4:
/* fall through */
@@ -1027,6 +1039,14 @@ u64 kvm_get_arch_capabilities(void)
rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
/*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
* If we're doing cache flushes (either "always" or "cond")
* we will do one whenever the guest does a vmlaunch/vmresume.
* If an outer hypervisor is doing the cache flush for us
@@ -1038,8 +1058,40 @@ u64 kvm_get_arch_capabilities(void)
if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+
+ /*
+ * On TAA affected systems, export MDS_NO=0 when:
+ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+ * - Updated microcode is present. This is detected by
+ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+ * that VERW clears CPU buffers.
+ *
+ * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+ * mitigation and don't complain:
+ *
+ * "Vulnerable: Clear CPU buffers attempted, no microcode"
+ *
+ * If TSX is disabled on the system, guests are also mitigated against
+ * TAA and clear CPU buffer mitigation is not required for guests.
+ */
+ if (!boot_cpu_has(X86_FEATURE_RTM))
+ data &= ~ARCH_CAP_TAA_NO;
+ else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ data |= ARCH_CAP_TAA_NO;
+ else if (data & ARCH_CAP_TSX_CTRL_MSR)
+ data &= ~ARCH_CAP_MDS_NO;
+
+ /* KVM does not emulate MSR_IA32_TSX_CTRL. */
+ data &= ~ARCH_CAP_TSX_CTRL_MSR;
return data;
}
+
EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -1073,11 +1125,8 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
return 0;
}
-bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (efer & efer_reserved_bits)
- return false;
-
if (efer & EFER_FFXSR) {
struct kvm_cpuid_entry2 *feat;
@@ -1095,19 +1144,33 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
}
return true;
+
+}
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & efer_reserved_bits)
+ return false;
+
+ return __kvm_valid_efer(vcpu, efer);
}
EXPORT_SYMBOL_GPL(kvm_valid_efer);
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 old_efer = vcpu->arch.efer;
+ u64 efer = msr_info->data;
- if (!kvm_valid_efer(vcpu, efer))
+ if (efer & efer_reserved_bits)
return 1;
- if (is_paging(vcpu)
- && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
- return 1;
+ if (!msr_info->host_initiated) {
+ if (!__kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu) &&
+ (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+ }
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
@@ -1354,7 +1417,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
vcpu->arch.tsc_always_catchup = 1;
return 0;
} else {
- WARN(1, "user requested TSC rate below hardware speed\n");
+ pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
return -1;
}
}
@@ -1364,8 +1427,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
user_tsc_khz, tsc_khz);
if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
- WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
- user_tsc_khz);
+ pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
return -1;
}
@@ -2067,7 +2130,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
@@ -2197,8 +2263,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated)
vcpu->arch.microcode_version = data;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
case MSR_EFER:
- return set_efer(vcpu, data);
+ return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
@@ -2430,7 +2501,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
data = vcpu->arch.mce_banks[offset];
break;
}
@@ -2473,6 +2547,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has_arch_capabilities(vcpu))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -4598,6 +4678,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -5735,8 +5822,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
emulation_type))
return EMULATE_DONE;
- if (ctxt->have_exception && inject_emulated_exception(vcpu))
+ if (ctxt->have_exception) {
+ /*
+ * #UD should result in just EMULATION_FAILED, and trap-like
+ * exception should not be encountered during decode.
+ */
+ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+ inject_emulated_exception(vcpu);
return EMULATE_DONE;
+ }
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
return handle_emulation_failure(vcpu);
@@ -5801,12 +5896,13 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE && ctxt->tf)
- kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
- exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (r == EMULATE_DONE && ctxt->tf)
+ kvm_vcpu_do_singlestep(vcpu, &r);
__kvm_set_rflags(vcpu, ctxt->eflags);
+ }
/*
* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
@@ -5908,17 +6004,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- if (vcpu->cpu != smp_processor_id())
+ if (vcpu->cpu != raw_smp_processor_id())
send_ipi = 1;
}
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
@@ -6036,14 +6132,12 @@ static void kvm_set_mmio_spte_mask(void)
/* Set the present bit. */
mask |= 1ull;
-#ifdef CONFIG_X86_64
/*
* If reserved bit is not supported, clear the present bit to disable
* mmio page fault.
*/
if (maxphyaddr == 52)
mask &= ~1ull;
-#endif
kvm_mmu_set_mmio_spte_mask(mask);
}
@@ -6056,12 +6150,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
struct kvm_vcpu *vcpu;
int i;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
atomic_set(&kvm_guest_has_master_clock, 0);
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
}
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -7448,7 +7542,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+ if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
@@ -7672,6 +7766,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
kvm_vcpu_mtrr_init(vcpu);
r = vcpu_load(vcpu);
if (r)
@@ -7712,7 +7807,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
- kvm_x86_ops->vcpu_free(vcpu);
+ kvm_arch_vcpu_free(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -8028,6 +8123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -8056,6 +8152,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return 0;
}
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
+}
+
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
int r;
@@ -8162,6 +8263,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
}
EXPORT_SYMBOL_GPL(x86_set_memory_region);
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ kvm_mmu_pre_destroy_vm(kvm);
+}
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
if (current->mm == kvm->mm) {
@@ -8213,6 +8319,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
{
int i;
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by __kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
+
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
@@ -8286,6 +8399,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
+ if (change == KVM_MR_MOVE)
+ return kvm_arch_create_memslot(kvm, memslot,
+ mem->memory_size >> PAGE_SHIFT);
+
return 0;
}