summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2016-03-08 12:13:39 +0100
committerSasha Levin <sasha.levin@oracle.com>2016-03-22 11:10:34 -0400
commiteac525506a083a389ba173880979a6291401af2d (patch)
tree975319a19392fa0f648a1f5360493bae6a484714 /arch
parent09b4fd2014b1ef7d46df8df553f94254ba2a0497 (diff)
KVM: MMU: fix ept=0/pte.u=1/pte.w=0/CR0.WP=0/CR4.SMEP=1/EFER.NX=0 combo
[ Upstream commit 844a5fe219cf472060315971e15cbf97674a3324 ] Yes, all of these are needed. :) This is admittedly a bit odd, but kvm-unit-tests access.flat tests this if you run it with "-cpu host" and of course ept=0. KVM runs the guest with CR0.WP=1, so it must handle supervisor writes specially when pte.u=1/pte.w=0/CR0.WP=0. Such writes cause a fault when U=1 and W=0 in the SPTE, but they must succeed because CR0.WP=0. When KVM gets the fault, it sets U=0 and W=1 in the shadow PTE and restarts execution. This will still cause a user write to fault, while supervisor writes will succeed. User reads will fault spuriously now, and KVM will then flip U and W again in the SPTE (U=1, W=0). User reads will be enabled and supervisor writes disabled, going back to the originary situation where supervisor writes fault spuriously. When SMEP is in effect, however, U=0 will enable kernel execution of this page. To avoid this, KVM also sets NX=1 in the shadow PTE together with U=0. If the guest has not enabled NX, the result is a continuous stream of page faults due to the NX bit being reserved. The fix is to force EFER.NX=1 even if the CPU is taking care of the EFER switch. (All machines with SMEP have the CPU_LOAD_IA32_EFER vm-entry control, so they do not use user-return notifiers for EFER---if they did, EFER.NX would be forced to the same value as the host). There is another bug in the reserved bit check, which I've split to a separate patch for easier application to stable kernels. Cc: stable@vger.kernel.org Cc: Andy Lutomirski <luto@amacapital.net> Reviewed-by: Xiao Guangrong <guangrong.xiao@linux.intel.com> Fixes: f6577a5fa15d82217ca73c74cd2dcbc0f6c781dd Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/kvm/vmx.c36
1 files changed, 23 insertions, 13 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81cf77d2d5cf..917148620f49 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1718,26 +1718,31 @@ static void reload_tss(void)
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
- u64 guest_efer;
- u64 ignore_bits;
+ u64 guest_efer = vmx->vcpu.arch.efer;
+ u64 ignore_bits = 0;
- guest_efer = vmx->vcpu.arch.efer;
+ if (!enable_ept) {
+ /*
+ * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
+ * host CPUID is more efficient than testing guest CPUID
+ * or CR4. Host SMEP is anyway a requirement for guest SMEP.
+ */
+ if (boot_cpu_has(X86_FEATURE_SMEP))
+ guest_efer |= EFER_NX;
+ else if (!(guest_efer & EFER_NX))
+ ignore_bits |= EFER_NX;
+ }
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaningless
- * outside long mode
+ * LMA and LME handled by hardware; SCE meaningless outside long mode.
*/
- ignore_bits = EFER_NX | EFER_SCE;
+ ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME;
/* SCE is meaningful only in long mode on Intel */
if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE;
#endif
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
- vmx->guest_msrs[efer_offset].data = guest_efer;
- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
clear_atomic_switch_msr(vmx, MSR_EFER);
@@ -1748,16 +1753,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
*/
if (cpu_has_load_ia32_efer ||
(enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
- guest_efer = vmx->vcpu.arch.efer;
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, host_efer);
return false;
- }
+ } else {
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
- return true;
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+ return true;
+ }
}
static unsigned long segment_base(u16 selector)