summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-08-02 16:11:27 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-08-02 16:11:27 -0400
commit221bb8a46e230b9824204ae86537183d9991ff2a (patch)
tree92510d72285b2285be7cb87288bf088cb28af4c1 /arch/x86/kvm/vmx.c
parentf7b32e4c021fd788f13f6785e17efbc3eb05b351 (diff)
parent23528bb21ee2c9b27f3feddd77a2a3351a8df148 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: - ARM: GICv3 ITS emulation and various fixes. Removal of the old VGIC implementation. - s390: support for trapping software breakpoints, nested virtualization (vSIE), the STHYI opcode, initial extensions for CPU model support. - MIPS: support for MIPS64 hosts (32-bit guests only) and lots of cleanups, preliminary to this and the upcoming support for hardware virtualization extensions. - x86: support for execute-only mappings in nested EPT; reduced vmexit latency for TSC deadline timer (by about 30%) on Intel hosts; support for more than 255 vCPUs. - PPC: bugfixes. * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (302 commits) KVM: PPC: Introduce KVM_CAP_PPC_HTM MIPS: Select HAVE_KVM for MIPS64_R{2,6} MIPS: KVM: Reset CP0_PageMask during host TLB flush MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX() MIPS: KVM: Sign extend MFC0/RDHWR results MIPS: KVM: Fix 64-bit big endian dynamic translation MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase MIPS: KVM: Use 64-bit CP0_EBase when appropriate MIPS: KVM: Set CP0_Status.KX on MIPS64 MIPS: KVM: Make entry code MIPS64 friendly MIPS: KVM: Use kmap instead of CKSEG0ADDR() MIPS: KVM: Use virt_to_phys() to get commpage PFN MIPS: Fix definition of KSEGX() for 64-bit KVM: VMX: Add VMCS to CPU's loaded VMCSs before VMPTRLD kvm: x86: nVMX: maintain internal copy of current VMCS KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures KVM: arm64: vgic-its: Simplify MAPI error handling KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi similar to other handlers KVM: arm64: vgic-its: Turn device_id validation into generic ID validation ...
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--arch/x86/kvm/vmx.c386
1 files changed, 340 insertions, 46 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index df07a0a4611f..bc354f003ce1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -398,6 +405,12 @@ struct nested_vmx {
/* The host-usable pointer to the above */
struct page *current_vmcs12_page;
struct vmcs12 *current_vmcs12;
+ /*
+ * Cache of the guest's VMCS, existing outside of guest memory.
+ * Loaded from guest memory during VMPTRLD. Flushed to guest
+ * memory during VMXOFF, VMCLEAR, VMPTRLD.
+ */
+ struct vmcs12 *cached_vmcs12;
struct vmcs *current_shadow_vmcs;
/*
* Indicates if the shadow vmcs must be updated with the
@@ -421,7 +434,6 @@ struct nested_vmx {
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
- u64 msr_ia32_feature_control;
struct hrtimer preemption_timer;
bool preemption_timer_expired;
@@ -597,11 +609,22 @@ struct vcpu_vmx {
#define PML_ENTITY_NUM 512
struct page *pml_pg;
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
u64 current_tsc_ratio;
bool guest_pkru_valid;
u32 guest_pkru;
u32 host_pkru;
+
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * in msr_ia32_feature_control_valid_bits.
+ */
+ u64 msr_ia32_feature_control;
+ u64 msr_ia32_feature_control_valid_bits;
};
enum segment_cache_field {
@@ -841,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
- return to_vmx(vcpu)->nested.current_vmcs12;
+ return to_vmx(vcpu)->nested.cached_vmcs12;
}
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
@@ -1056,6 +1079,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86 - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+ u32 eax = cpuid_eax(0x00000001), i;
+
+ /* Clear the reserved bits */
+ eax &= ~(0x3U << 14 | 0xfU << 28);
+ for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+ if (eax == vmx_preemption_cpu_tfms[i])
+ return true;
+
+ return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+ return vmcs_config.pin_based_exec_ctrl &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
static inline bool cpu_has_vmx_posted_intr(void)
{
return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -1603,6 +1678,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
__vmcs_writel(field, __vmcs_readl(field) | mask);
}
+static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+}
+
static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
{
vmcs_write32(VM_ENTRY_CONTROLS, val);
@@ -1631,6 +1711,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
}
+static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+}
+
static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
{
vmcs_write32(VM_EXIT_CONTROLS, val);
@@ -2121,22 +2206,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
if (!vmm_exclusive)
kvm_cpu_vmxon(phys_addr);
- else if (vmx->loaded_vmcs->cpu != cpu)
+ else if (!already_loaded)
loaded_vmcs_clear(vmx->loaded_vmcs);
- if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
- per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
- vmcs_load(vmx->loaded_vmcs->vmcs);
- }
-
- if (vmx->loaded_vmcs->cpu != cpu) {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
- unsigned long sysenter_esp;
-
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ if (!already_loaded) {
local_irq_disable();
crash_disable_local_vmclear(cpu);
@@ -2151,6 +2228,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
&per_cpu(loaded_vmcss_on_cpu, cpu));
crash_enable_local_vmclear(cpu);
local_irq_enable();
+ }
+
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ }
+
+ if (!already_loaded) {
+ struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+ unsigned long sysenter_esp;
+
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
@@ -2716,6 +2805,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
+ if (cpu_has_vmx_ept_execute_only())
+ vmx->nested.nested_vmx_ept_caps |=
+ VMX_EPT_EXECUTE_ONLY_BIT;
vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
/*
* For nested guests, we don't do anything specific
@@ -2864,6 +2956,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
return 0;
}
+static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+ uint64_t val)
+{
+ uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+
+ return !(val & ~valid_bits);
+}
+
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
@@ -2905,10 +3005,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
- case MSR_IA32_FEATURE_CONTROL:
- if (!nested_vmx_allowed(vcpu))
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE))
return 1;
- msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+ msr_info->data = vcpu->arch.mcg_ext_ctl;
+ break;
+ case MSR_IA32_FEATURE_CONTROL:
+ msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
@@ -2998,12 +3103,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC_ADJUST:
ret = kvm_set_msr_common(vcpu, msr_info);
break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE)) ||
+ (data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = data;
+ break;
case MSR_IA32_FEATURE_CONTROL:
- if (!nested_vmx_allowed(vcpu) ||
- (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+ if (!vmx_feature_control_msr_valid(vcpu, data) ||
+ (to_vmx(vcpu)->msr_ia32_feature_control &
FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
return 1;
- vmx->nested.msr_ia32_feature_control = data;
+ vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
break;
@@ -3297,25 +3410,27 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmx_capability.ept, vmx_capability.vpid);
}
- min = VM_EXIT_SAVE_DEBUG_CONTROLS;
+ min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
- VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
+ VM_EXIT_CLEAR_BNDCFGS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
+ if (cpu_has_broken_vmx_preemption_timer())
+ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
if (!(_cpu_based_2nd_exec_control &
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
- !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3364,7 +3479,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
/*
* Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
- * but due to arrata below it can't be used. Workaround is to use
+ * but due to errata below it can't be used. Workaround is to use
* msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
*
* VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
@@ -4781,6 +4896,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
if (!kvm_vcpu_apicv_active(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ /* Enable the preemption timer dynamically */
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
return pin_based_exec_ctrl;
}
@@ -4896,6 +5013,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
/* Control */
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+ vmx->hv_deadline_tsc = -1;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
@@ -6016,12 +6134,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- /* It is a write fault? */
- error_code = exit_qualification & PFERR_WRITE_MASK;
+ /* it is a read fault? */
+ error_code = (exit_qualification << 2) & PFERR_USER_MASK;
+ /* it is a write fault? */
+ error_code |= exit_qualification & PFERR_WRITE_MASK;
/* It is a fetch fault? */
error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
/* ept page table is present? */
- error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
+ error_code |= (exit_qualification & 0x38) != 0;
vcpu->arch.exit_qualification = exit_qualification;
@@ -6355,9 +6475,6 @@ static __init int hardware_setup(void)
for (msr = 0x800; msr <= 0x8ff; msr++)
vmx_disable_intercept_msr_read_x2apic(msr);
- /* According SDM, in x2apic mode, the whole id reg is used. But in
- * KVM, it only use the highest eight bits. Need to intercept it */
- vmx_enable_intercept_msr_read_x2apic(0x802);
/* TMCCT */
vmx_enable_intercept_msr_read_x2apic(0x839);
/* TPR */
@@ -6368,10 +6485,12 @@ static __init int hardware_setup(void)
vmx_disable_intercept_msr_write_x2apic(0x83f);
if (enable_ept) {
- kvm_mmu_set_mask_ptes(0ull,
+ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK);
+ 0ull, VMX_EPT_EXECUTABLE_MASK,
+ cpu_has_vmx_ept_execute_only() ?
+ 0ull : VMX_EPT_READABLE_MASK);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
} else
@@ -6393,8 +6512,21 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+ u64 vmx_msr;
+
+ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+ cpu_preemption_timer_multi =
+ vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ } else {
+ kvm_x86_ops->set_hv_timer = NULL;
+ kvm_x86_ops->cancel_hv_timer = NULL;
+ }
+
kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ kvm_mce_cap_supported |= MCG_LMCE_P;
+
return alloc_kvm_area();
out8:
@@ -6862,16 +6994,22 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+ if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
kvm_inject_gp(vcpu, 0);
return 1;
}
+ vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_vmcs12)
+ return -ENOMEM;
+
if (enable_shadow_vmcs) {
shadow_vmcs = alloc_vmcs();
- if (!shadow_vmcs)
+ if (!shadow_vmcs) {
+ kfree(vmx->nested.cached_vmcs12);
return -ENOMEM;
+ }
/* mark vmcs as shadow */
shadow_vmcs->revision_id |= (1u << 31);
/* init shadow vmcs */
@@ -6942,6 +7080,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
vmx->nested.posted_intr_nv = -1;
+
+ /* Flush VMCS12 to guest memory */
+ memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
+ VMCS12_SIZE);
+
kunmap(vmx->nested.current_vmcs12_page);
nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
@@ -6962,6 +7105,7 @@ static void free_nested(struct vcpu_vmx *vmx)
nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
+ kfree(vmx->nested.cached_vmcs12);
/* Unpin physical memory we referred to in current vmcs02 */
if (vmx->nested.apic_access_page) {
nested_release_page(vmx->nested.apic_access_page);
@@ -7365,6 +7509,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
vmx->nested.current_vmptr = vmptr;
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
+ /*
+ * Load VMCS12 from guest memory since it is not already
+ * cached.
+ */
+ memcpy(vmx->nested.cached_vmcs12,
+ vmx->nested.current_vmcs12, VMCS12_SIZE);
+
if (enable_shadow_vmcs) {
vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
SECONDARY_EXEC_SHADOW_VMCS);
@@ -7560,6 +7711,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7610,6 +7767,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7918,6 +8076,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return false;
default:
return true;
}
@@ -8303,7 +8463,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
* the next L2->L1 exit.
*/
if (!is_guest_mode(vcpu) ||
- !nested_cpu_has2(vmx->nested.current_vmcs12,
+ !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
vmcs_write64(APIC_ACCESS_ADDR, hpa);
}
@@ -8436,7 +8596,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
"push %[sp]\n\t"
#endif
"pushf\n\t"
- "orl $0x200, (%%" _ASM_SP ")\n\t"
__ASM_SIZE(push) " $%c[cs]\n\t"
"call *%[entry]\n\t"
:
@@ -8449,8 +8608,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
[ss]"i"(__KERNEL_DS),
[cs]"i"(__KERNEL_CS)
);
- } else
- local_irq_enable();
+ }
}
static bool vmx_has_high_real_mode_segbase(void)
@@ -8601,6 +8759,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host);
}
+void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+ if (vmx->hv_deadline_tsc == -1)
+ return;
+
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* sure to be 32 bit only because checked on set_hv_timer */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
+
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8650,6 +8828,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
+ vmx_arm_hv_timer(vcpu);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -8940,6 +9120,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
return &vmx->vcpu;
free_vmcs:
@@ -9080,6 +9262,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (cpu_has_secondary_exec_ctrls())
vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+ if (nested_vmx_allowed(vcpu))
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9636,9 +9825,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
exec_control = vmcs12->pin_based_vm_exec_control;
- exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+ /* Preemption timer setting is only taken from vmcs01. */
exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control |= vmcs_config.pin_based_exec_ctrl;
+ if (vmx->hv_deadline_tsc == -1)
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ /* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
/*
* Note that we use L0's vector here and in
@@ -10556,8 +10750,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
- vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
- vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
+ vm_entry_controls_reset_shadow(vmx);
+ vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
/* if no vmcs02 cache requested, remove the one we used */
@@ -10566,8 +10760,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
- /* Update TSC_OFFSET if TSC was changed while L2 ran */
+ /* Update any VMCS fields that might have changed while L2 ran */
vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+ if (vmx->hv_deadline_tsc == -1)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ else
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
/* This is needed for same reason as it was needed in prepare_vmcs02 */
vmx->host_rsp = 0;
@@ -10647,6 +10847,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+ u64 divisor, u64 *result)
+{
+ u64 low = a << shift, high = a >> (64 - shift);
+
+ /* To avoid the overflow on divq */
+ if (high >= divisor)
+ return 1;
+
+ /* Low hold the result, high hold rem which is discarded */
+ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+ "rm" (divisor), "0" (low), "1" (high));
+ *result = low;
+
+ return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl = rdtsc();
+ u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+
+ /* Convert to host delta tsc if tsc scaling is enabled */
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ u64_shl_div_u64(delta_tsc,
+ kvm_tsc_scaling_ratio_frac_bits,
+ vcpu->arch.tsc_scaling_ratio,
+ &delta_tsc))
+ return -ERANGE;
+
+ /*
+ * If the delta tsc can't fit in the 32 bit after the multi shift,
+ * we can't use the preemption timer.
+ * It's possible that it fits on later vmentries, but checking
+ * on every vmentry is costly so we just use an hrtimer.
+ */
+ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+ return -ERANGE;
+
+ vmx->hv_deadline_tsc = tscl + delta_tsc;
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ vmx->hv_deadline_tsc = -1;
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (ple_gap)
@@ -10691,7 +10949,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
* this case, return 1, otherwise, return 0.
*
*/
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
{
unsigned long flags;
unsigned int dest;
@@ -10758,7 +11016,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
return 0;
}
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ if (pi_pre_block(vcpu))
+ return 1;
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
@@ -10800,6 +11069,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
}
}
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->set_hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
+
+ pi_post_block(vcpu);
+}
+
/*
* vmx_update_pi_irte - set IRTE for Posted-Interrupts
*
@@ -10844,7 +11121,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* We will support full lowest-priority interrupt later.
*/
- kvm_set_msi_irq(e, &irq);
+ kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
/*
* Make sure the IRTE is in remapped mode if
@@ -10889,6 +11166,16 @@ out:
return ret;
}
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_LMCE;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_LMCE;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -11013,6 +11300,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
.pmu_ops = &intel_pmu_ops,
.update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
};
static int __init vmx_init(void)