author	Thomas Gleixner <tglx@linutronix.de>	2015-12-19 11:49:13 +0100
committer	Thomas Gleixner <tglx@linutronix.de>	2015-12-19 11:49:13 +0100
commit	0fa85119cd480c1ded7a81ed64f723fe16a15355 (patch)
tree	8648434601c5112a1d9ab091ab11507fe88e0962 /arch/x86
parent	d6ccc3ec95251d8d3276f2900b59cbc468dd74f4 (diff)
parent	1eab0e42450c6038e2bb17da438370fe639973f3 (diff)
Merge branch 'linus' into x86/cleanups
Pull in upstream changes so we can apply depending patches.
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig.debug | 1
-rw-r--r--  arch/x86/Makefile | 6
-rw-r--r--  arch/x86/boot/Makefile | 4
-rw-r--r--  arch/x86/boot/boot.h | 1
-rw-r--r--  arch/x86/boot/video-mode.c | 2
-rw-r--r--  arch/x86/boot/video.c | 2
-rw-r--r--  arch/x86/crypto/Makefile | 8
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/sha1_ni_asm.S | 302
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c | 314
-rw-r--r--  arch/x86/crypto/sha256_ni_asm.S | 353
-rw-r--r--  arch/x86/crypto/sha256_ssse3_glue.c | 329
-rw-r--r--  arch/x86/crypto/sha512_ssse3_glue.c | 249
-rw-r--r--  arch/x86/entry/entry_64.S | 19
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl | 1
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 1
-rw-r--r--  arch/x86/include/asm/highmem.h | 1
-rw-r--r--  arch/x86/include/asm/i8259.h | 1
-rw-r--r--  arch/x86/include/asm/irq_remapping.h | 10
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 10
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 65
-rw-r--r--  arch/x86/include/asm/msr-index.h | 10
-rw-r--r--  arch/x86/include/asm/page_types.h | 16
-rw-r--r--  arch/x86/include/asm/pgtable.h | 10
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 14
-rw-r--r--  arch/x86/include/asm/vmx.h | 6
-rw-r--r--  arch/x86/include/asm/x86_init.h | 1
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 5
-rw-r--r--  arch/x86/include/asm/xen/page.h | 8
-rw-r--r--  arch/x86/include/uapi/asm/hyperv.h | 18
-rw-r--r--  arch/x86/include/uapi/asm/svm.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 22
-rw-r--r--  arch/x86/kernel/apic/vector.c | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 13
-rw-r--r--  arch/x86/kernel/cpu/common.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 1
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 1
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 5
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_cqm.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_rapl.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_msr.c | 7
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 11
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 1
-rw-r--r--  arch/x86/kernel/ftrace.c | 4
-rw-r--r--  arch/x86/kernel/head_64.S | 8
-rw-r--r--  arch/x86/kernel/i8259.c | 29
-rw-r--r--  arch/x86/kernel/irq_work.c | 2
-rw-r--r--  arch/x86/kernel/kvmclock.c | 46
-rw-r--r--  arch/x86/kernel/livepatch.c | 9
-rw-r--r--  arch/x86/kernel/mcount_64.S | 6
-rw-r--r--  arch/x86/kernel/pci-dma.c | 2
-rw-r--r--  arch/x86/kernel/pmem.c | 12
-rw-r--r--  arch/x86/kernel/setup.c | 4
-rw-r--r--  arch/x86/kernel/signal.c | 17
-rw-r--r--  arch/x86/kernel/smpboot.c | 9
-rw-r--r--  arch/x86/kernel/verify_cpu.S | 12
-rw-r--r--  arch/x86/kvm/Kconfig | 2
-rw-r--r--  arch/x86/kvm/assigned-dev.c | 62
-rw-r--r--  arch/x86/kvm/cpuid.c | 2
-rw-r--r--  arch/x86/kvm/cpuid.h | 37
-rw-r--r--  arch/x86/kvm/emulate.c | 35
-rw-r--r--  arch/x86/kvm/hyperv.c | 31
-rw-r--r--  arch/x86/kvm/i8254.c | 4
-rw-r--r--  arch/x86/kvm/ioapic.c | 29
-rw-r--r--  arch/x86/kvm/ioapic.h | 15
-rw-r--r--  arch/x86/kvm/irq.c | 40
-rw-r--r--  arch/x86/kvm/irq.h | 27
-rw-r--r--  arch/x86/kvm/irq_comm.c | 129
-rw-r--r--  arch/x86/kvm/lapic.c | 131
-rw-r--r--  arch/x86/kvm/lapic.h | 7
-rw-r--r--  arch/x86/kvm/mmu.c | 111
-rw-r--r--  arch/x86/kvm/mmu.h | 6
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 22
-rw-r--r--  arch/x86/kvm/svm.c | 181
-rw-r--r--  arch/x86/kvm/trace.h | 51
-rw-r--r--  arch/x86/kvm/vmx.c | 814
-rw-r--r--  arch/x86/kvm/x86.c | 442
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 19
-rw-r--r--  arch/x86/mm/highmem_32.c | 14
-rw-r--r--  arch/x86/mm/init.c | 4
-rw-r--r--  arch/x86/mm/init_64.c | 4
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 2
-rw-r--r--  arch/x86/mm/mpx.c | 53
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 2
-rw-r--r--  arch/x86/pci/acpi.c | 296
-rw-r--r--  arch/x86/pci/bus_numa.c | 13
-rw-r--r--  arch/x86/pci/common.c | 8
-rw-r--r--  arch/x86/pci/legacy.c | 2
-rw-r--r--  arch/x86/um/signal.c | 18
-rw-r--r--  arch/x86/um/stub_32.S | 1
-rw-r--r--  arch/x86/um/stub_64.S | 18
-rw-r--r--  arch/x86/xen/enlighten.c | 15
-rw-r--r--  arch/x86/xen/grant-table.c | 2
-rw-r--r--  arch/x86/xen/mmu.c | 10
-rw-r--r--  arch/x86/xen/p2m.c | 19
-rw-r--r--  arch/x86/xen/setup.c | 9
-rw-r--r--  arch/x86/xen/suspend.c | 20
101 files changed, 3512 insertions(+), 1215 deletions(-)
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 3e0baf726eef..137dfa96aa14 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -113,7 +113,6 @@ config DEBUG_RODATA_TEST
config DEBUG_WX
bool "Warn on W+X mappings at boot"
depends on DEBUG_RODATA
- default y
select X86_PTDUMP_CORE
---help---
Generate a warning if any W+X mappings are found at boot.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2dfaa72260b4..4086abca0b32 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -171,9 +171,11 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
+sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 0d553e54171b..2ee62dba0373 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,13 +9,13 @@
# Changed by many, many contributors over the years.
#
+KASAN_SANITIZE := n
+
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
# The number is the same as you would ordinarily press at bootup.
-KASAN_SANITIZE := n
-
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
targets := vmlinux.bin setup.bin setup.elf bzImage
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 0033e96c3f09..9011a88353de 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -23,7 +23,6 @@
#include <stdarg.h>
#include <linux/types.h>
#include <linux/edd.h>
-#include <asm/boot.h>
#include <asm/setup.h>
#include "bitops.h"
#include "ctype.h"
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index aa8a96b052e3..95c7a818c0ed 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -19,6 +19,8 @@
#include "video.h"
#include "vesa.h"
+#include <uapi/asm/boot.h>
+
/*
* Common variables
*/
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 05111bb8d018..77780e386e9b 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -13,6 +13,8 @@
* Select video mode
*/
+#include <uapi/asm/boot.h>
+
#include "boot.h"
#include "video.h"
#include "vesa.h"
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9a2838cf0591..b9b912a44d61 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,8 @@
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
+sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
+sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -91,9 +93,15 @@ ifeq ($(avx2_supported),yes)
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
poly1305-x86_64-y += poly1305-avx2-x86_64.o
endif
+ifeq ($(sha1_ni_supported),yes)
+sha1-ssse3-y += sha1_ni_asm.o
+endif
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+ifeq ($(sha256_ni_supported),yes)
+sha256-ssse3-y += sha256_ni_asm.o
+endif
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 225be06edc80..4fe27e074194 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -330,7 +330,7 @@ ENDPROC(crc_pcl)
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
################################################################
-.section .rotata, "a", %progbits
+.section .rodata, "a", %progbits
.align 8
K_table:
.long 0x493c7d27, 0x00000001
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
new file mode 100644
index 000000000000..874a651b9e7d
--- /dev/null
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -0,0 +1,302 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define RSPSAVE %rax
+
+/* gcc conversion */
+#define FRAME_SIZE 32 /* space for 2x16 bytes */
+
+#define ABCD %xmm0
+#define E0 %xmm1 /* Need two E's b/c they ping pong */
+#define E1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define SHUF_MASK %xmm7
+
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha1_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks)
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+.text
+.align 32
+ENTRY(sha1_ni_transform)
+ mov %rsp, RSPSAVE
+ sub $FRAME_SIZE, %rsp
+ and $~0xF, %rsp
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /* load initial hash values */
+ pinsrd $3, 1*16(DIGEST_PTR), E0
+ movdqu 0*16(DIGEST_PTR), ABCD
+ pand UPPER_WORD_MASK(%rip), E0
+ pshufd $0x1B, ABCD, ABCD
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa E0, (0*16)(%rsp)
+ movdqa ABCD, (1*16)(%rsp)
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG0
+ pshufb SHUF_MASK, MSG0
+ paddd MSG0, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG1
+ pshufb SHUF_MASK, MSG1
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG1, MSG0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG2
+ pshufb SHUF_MASK, MSG2
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG3
+ pshufb SHUF_MASK, MSG3
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 16-19 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 20-23 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 24-27 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 28-31 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 32-35 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 36-39 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 40-43 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 44-47 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 48-51 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 52-55 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 56-59 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 60-63 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $3, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 64-67 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $3, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 68-71 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $3, E1, ABCD
+ pxor MSG1, MSG3
+
+ /* Rounds 72-75 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $3, E0, ABCD
+
+ /* Rounds 76-79 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1rnds4 $3, E1, ABCD
+
+ /* Add current hash values with previously saved */
+ sha1nexte (0*16)(%rsp), E0
+ paddd (1*16)(%rsp), ABCD
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, ABCD, ABCD
+ movdqu ABCD, 0*16(DIGEST_PTR)
+ pextrd $3, E0, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+ mov RSPSAVE, %rsp
+
+ ret
+ENDPROC(sha1_ni_transform)
+
+.data
+
+.align 64
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+UPPER_WORD_MASK:
+ .octa 0xFFFFFFFF000000000000000000000000
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 00212c32d4db..dd14616b7739 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -31,24 +31,11 @@
#include <crypto/sha1_base.h>
#include <asm/fpu/api.h>
+typedef void (sha1_transform_fn)(u32 *digest, const char *data,
+ unsigned int rounds);
-asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
- unsigned int rounds);
-#ifdef CONFIG_AS_AVX
-asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
- unsigned int rounds);
-#endif
-#ifdef CONFIG_AS_AVX2
-#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
-
-asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
- unsigned int rounds);
-#endif
-
-static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
-
-static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha1_transform_fn *sha1_xform)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
@@ -61,14 +48,14 @@ static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_transform_asm);
+ (sha1_block_fn *)sha1_xform);
kernel_fpu_end();
return 0;
}
-static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+static int sha1_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha1_transform_fn *sha1_xform)
{
if (!irq_fpu_usable())
return crypto_sha1_finup(desc, data, len, out);
@@ -76,32 +63,37 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
if (len)
sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_transform_asm);
- sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm);
+ (sha1_block_fn *)sha1_xform);
+ sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform);
kernel_fpu_end();
return sha1_base_finish(desc, out);
}
-/* Add padding and return the message digest. */
-static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
+asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- return sha1_ssse3_finup(desc, NULL, 0, out);
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_transform_ssse3);
}
-#ifdef CONFIG_AS_AVX2
-static void sha1_apply_transform_avx2(u32 *digest, const char *data,
- unsigned int rounds)
+static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
- /* Select the optimal transform based on data block size */
- if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
- sha1_transform_avx2(digest, data, rounds);
- else
- sha1_transform_avx(digest, data, rounds);
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_transform_ssse3);
+}
+
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_ssse3_finup(desc, NULL, 0, out);
}
-#endif
-static struct shash_alg alg = {
+static struct shash_alg sha1_ssse3_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_ssse3_update,
@@ -110,7 +102,7 @@ static struct shash_alg alg = {
.descsize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
- .cra_driver_name= "sha1-ssse3",
+ .cra_driver_name = "sha1-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
@@ -118,8 +110,60 @@ static struct shash_alg alg = {
}
};
+static int register_sha1_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shash(&sha1_ssse3_alg);
+ return 0;
+}
+
+static void unregister_sha1_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shash(&sha1_ssse3_alg);
+}
+
#ifdef CONFIG_AS_AVX
-static bool __init avx_usable(void)
+asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_transform_avx);
+}
+
+static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_transform_avx);
+}
+
+static int sha1_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_avx_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_avx_update,
+ .final = sha1_avx_final,
+ .finup = sha1_avx_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static bool avx_usable(void)
{
if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (cpu_has_avx)
@@ -130,55 +174,197 @@ static bool __init avx_usable(void)
return true;
}
-#ifdef CONFIG_AS_AVX2
-static bool __init avx2_usable(void)
+static int register_sha1_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shash(&sha1_avx_alg);
+ return 0;
+}
+
+static void unregister_sha1_avx(void)
{
- if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) &&
- boot_cpu_has(X86_FEATURE_BMI2))
+ if (avx_usable())
+ crypto_unregister_shash(&sha1_avx_alg);
+}
+
+#else /* CONFIG_AS_AVX */
+static inline int register_sha1_avx(void) { return 0; }
+static inline void unregister_sha1_avx(void) { }
+#endif /* CONFIG_AS_AVX */
+
+
+#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
+#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
+
+asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+ && boot_cpu_has(X86_FEATURE_BMI1)
+ && boot_cpu_has(X86_FEATURE_BMI2))
return true;
return false;
}
+
+static void sha1_apply_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds)
+{
+ /* Select the optimal transform based on data block size */
+ if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
+ sha1_transform_avx2(digest, data, rounds);
+ else
+ sha1_transform_avx(digest, data, rounds);
+}
+
+static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_apply_transform_avx2);
+}
+
+static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_apply_transform_avx2);
+}
+
+static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_avx2_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_avx2_update,
+ .final = sha1_avx2_final,
+ .finup = sha1_avx2_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shash(&sha1_avx2_alg);
+ return 0;
+}
+
+static void unregister_sha1_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shash(&sha1_avx2_alg);
+}
+
+#else
+static inline int register_sha1_avx2(void) { return 0; }
+static inline void unregister_sha1_avx2(void) { }
#endif
+
+#ifdef CONFIG_AS_SHA1_NI
+asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_ni_transform);
+}
+
+static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_ni_transform);
+}
+
+static int sha1_ni_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_ni_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_ni_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_ni_update,
+ .final = sha1_ni_final,
+ .finup = sha1_ni_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-ni",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ return crypto_register_shash(&sha1_ni_alg);
+ return 0;
+}
+
+static void unregister_sha1_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ crypto_unregister_shash(&sha1_ni_alg);
+}
+
+#else
+static inline int register_sha1_ni(void) { return 0; }
+static inline void unregister_sha1_ni(void) { }
#endif
static int __init sha1_ssse3_mod_init(void)
{
- char *algo_name;
+ if (register_sha1_ssse3())
+ goto fail;
- /* test for SSSE3 first */
- if (cpu_has_ssse3) {
- sha1_transform_asm = sha1_transform_ssse3;
- algo_name = "SSSE3";
+ if (register_sha1_avx()) {
+ unregister_sha1_ssse3();
+ goto fail;
}
-#ifdef CONFIG_AS_AVX
- /* allow AVX to override SSSE3, it's a little faster */
- if (avx_usable()) {
- sha1_transform_asm = sha1_transform_avx;
- algo_name = "AVX";
-#ifdef CONFIG_AS_AVX2
- /* allow AVX2 to override AVX, it's a little faster */
- if (avx2_usable()) {
- sha1_transform_asm = sha1_apply_transform_avx2;
- algo_name = "AVX2";
- }
-#endif
+ if (register_sha1_avx2()) {
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+ goto fail;
}
-#endif
- if (sha1_transform_asm) {
- pr_info("Using %s optimized SHA-1 implementation\n", algo_name);
- return crypto_register_shash(&alg);
+ if (register_sha1_ni()) {
+ unregister_sha1_avx2();
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+ goto fail;
}
- pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n");
+ return 0;
+fail:
return -ENODEV;
}
static void __exit sha1_ssse3_mod_fini(void)
{
- crypto_unregister_shash(&alg);
+ unregister_sha1_ni();
+ unregister_sha1_avx2();
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
}
module_init(sha1_ssse3_mod_init);
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
new file mode 100644
index 000000000000..748cdf21a938
--- /dev/null
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -0,0 +1,353 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define SHA256CONSTANTS %rax
+
+#define MSG %xmm0
+#define STATE0 %xmm1
+#define STATE1 %xmm2
+#define MSGTMP0 %xmm3
+#define MSGTMP1 %xmm4
+#define MSGTMP2 %xmm5
+#define MSGTMP3 %xmm6
+#define MSGTMP4 %xmm7
+
+#define SHUF_MASK %xmm8
+
+#define ABEF_SAVE %xmm9
+#define CDGH_SAVE %xmm10
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha256_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks);
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+
+.text
+.align 32
+ENTRY(sha256_ni_transform)
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /*
+ * load initial hash values
+ * Need to reorder these appropriately
+ * DCBA, HGFE -> ABEF, CDGH
+ */
+ movdqu 0*16(DIGEST_PTR), STATE0
+ movdqu 1*16(DIGEST_PTR), STATE1
+
+ pshufd $0xB1, STATE0, STATE0 /* CDAB */
+ pshufd $0x1B, STATE1, STATE1 /* EFGH */
+ movdqa STATE0, MSGTMP4
+ palignr $8, STATE1, STATE0 /* ABEF */
+ pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+ lea K256(%rip), SHA256CONSTANTS
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa STATE0, ABEF_SAVE
+ movdqa STATE1, CDGH_SAVE
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP0
+ paddd 0*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP1
+ paddd 1*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP2
+ paddd 2*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP3
+ paddd 3*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 16-19 */
+ movdqa MSGTMP0, MSG
+ paddd 4*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 20-23 */
+ movdqa MSGTMP1, MSG
+ paddd 5*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 24-27 */
+ movdqa MSGTMP2, MSG
+ paddd 6*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 28-31 */
+ movdqa MSGTMP3, MSG
+ paddd 7*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 32-35 */
+ movdqa MSGTMP0, MSG
+ paddd 8*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 36-39 */
+ movdqa MSGTMP1, MSG
+ paddd 9*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 40-43 */
+ movdqa MSGTMP2, MSG
+ paddd 10*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 44-47 */
+ movdqa MSGTMP3, MSG
+ paddd 11*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 48-51 */
+ movdqa MSGTMP0, MSG
+ paddd 12*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 52-55 */
+ movdqa MSGTMP1, MSG
+ paddd 13*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 56-59 */
+ movdqa MSGTMP2, MSG
+ paddd 14*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 60-63 */
+ movdqa MSGTMP3, MSG
+ paddd 15*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Add current hash values with previously saved */
+ paddd ABEF_SAVE, STATE0
+ paddd CDGH_SAVE, STATE1
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, STATE0, STATE0 /* FEBA */
+ pshufd $0xB1, STATE1, STATE1 /* DCHG */
+ movdqa STATE0, MSGTMP4
+ pblendw $0xF0, STATE1, STATE0 /* DCBA */
+ palignr $8, MSGTMP4, STATE1 /* HGFE */
+
+ movdqu STATE0, 0*16(DIGEST_PTR)
+ movdqu STATE1, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+
+ ret
+ENDPROC(sha256_ni_transform)
+
+.data
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 0e0e85aea634..5f4d6086dc59 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -42,19 +42,10 @@
asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
u64 rounds);
-#ifdef CONFIG_AS_AVX
-asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
- u64 rounds);
-#endif
-#ifdef CONFIG_AS_AVX2
-asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
- u64 rounds);
-#endif
-
-static void (*sha256_transform_asm)(u32 *, const char *, u64);
+typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
-static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha256_transform_fn *sha256_xform)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
@@ -67,14 +58,14 @@ static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_transform_asm);
+ (sha256_block_fn *)sha256_xform);
kernel_fpu_end();
return 0;
}
-static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+static int sha256_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
{
if (!irq_fpu_usable())
return crypto_sha256_finup(desc, data, len, out);
@@ -82,20 +73,32 @@ static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
if (len)
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_transform_asm);
- sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm);
+ (sha256_block_fn *)sha256_xform);
+ sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform);
kernel_fpu_end();
return sha256_base_finish(desc, out);
}
+static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_ssse3);
+}
+
+static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
+}
+
/* Add padding and return the message digest. */
static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
{
return sha256_ssse3_finup(desc, NULL, 0, out);
}
-static struct shash_alg algs[] = { {
+static struct shash_alg sha256_ssse3_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_ssse3_update,
@@ -127,8 +130,75 @@ static struct shash_alg algs[] = { {
}
} };
+static int register_sha256_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(sha256_ssse3_algs,
+ ARRAY_SIZE(sha256_ssse3_algs));
+ return 0;
+}
+
+static void unregister_sha256_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(sha256_ssse3_algs,
+ ARRAY_SIZE(sha256_ssse3_algs));
+}
+
#ifdef CONFIG_AS_AVX
-static bool __init avx_usable(void)
+asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
+ u64 rounds);
+
+static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_avx);
+}
+
+static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_avx);
+}
+
+static int sha256_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_avx_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_avx_update,
+ .final = sha256_avx_final,
+ .finup = sha256_avx_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_avx_update,
+ .final = sha256_avx_final,
+ .finup = sha256_avx_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx_usable(void)
{
if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (cpu_has_avx)
@@ -138,47 +208,216 @@ static bool __init avx_usable(void)
return true;
}
-#endif
-static int __init sha256_ssse3_mod_init(void)
+static int register_sha256_avx(void)
{
- /* test for SSSE3 first */
- if (cpu_has_ssse3)
- sha256_transform_asm = sha256_transform_ssse3;
+ if (avx_usable())
+ return crypto_register_shashes(sha256_avx_algs,
+ ARRAY_SIZE(sha256_avx_algs));
+ return 0;
+}
-#ifdef CONFIG_AS_AVX
- /* allow AVX to override SSSE3, it's a little faster */
- if (avx_usable()) {
-#ifdef CONFIG_AS_AVX2
- if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2))
- sha256_transform_asm = sha256_transform_rorx;
- else
+static void unregister_sha256_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shashes(sha256_avx_algs,
+ ARRAY_SIZE(sha256_avx_algs));
+}
+
+#else
+static inline int register_sha256_avx(void) { return 0; }
+static inline void unregister_sha256_avx(void) { }
#endif
- sha256_transform_asm = sha256_transform_avx;
+
+#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
+asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
+ u64 rounds);
+
+static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_rorx);
+}
+
+static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_rorx);
+}
+
+static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_avx2_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_avx2_update,
+ .final = sha256_avx2_final,
+ .finup = sha256_avx2_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
}
-#endif
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_avx2_update,
+ .final = sha256_avx2_final,
+ .finup = sha256_avx2_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
- if (sha256_transform_asm) {
-#ifdef CONFIG_AS_AVX
- if (sha256_transform_asm == sha256_transform_avx)
- pr_info("Using AVX optimized SHA-256 implementation\n");
-#ifdef CONFIG_AS_AVX2
- else if (sha256_transform_asm == sha256_transform_rorx)
- pr_info("Using AVX2 optimized SHA-256 implementation\n");
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static int register_sha256_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shashes(sha256_avx2_algs,
+ ARRAY_SIZE(sha256_avx2_algs));
+ return 0;
+}
+
+static void unregister_sha256_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shashes(sha256_avx2_algs,
+ ARRAY_SIZE(sha256_avx2_algs));
+}
+
+#else
+static inline int register_sha256_avx2(void) { return 0; }
+static inline void unregister_sha256_avx2(void) { }
#endif
- else
+
+#ifdef CONFIG_AS_SHA256_NI
+asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
+ u64 rounds); /*unsigned int rounds);*/
+
+static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_ni_transform);
+}
+
+static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_ni_transform);
+}
+
+static int sha256_ni_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_ni_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_ni_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_ni_update,
+ .final = sha256_ni_final,
+ .finup = sha256_ni_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-ni",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_ni_update,
+ .final = sha256_ni_final,
+ .finup = sha256_ni_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-ni",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha256_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ return crypto_register_shashes(sha256_ni_algs,
+ ARRAY_SIZE(sha256_ni_algs));
+ return 0;
+}
+
+static void unregister_sha256_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ crypto_unregister_shashes(sha256_ni_algs,
+ ARRAY_SIZE(sha256_ni_algs));
+}
+
+#else
+static inline int register_sha256_ni(void) { return 0; }
+static inline void unregister_sha256_ni(void) { }
#endif
- pr_info("Using SSSE3 optimized SHA-256 implementation\n");
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+static int __init sha256_ssse3_mod_init(void)
+{
+ if (register_sha256_ssse3())
+ goto fail;
+
+ if (register_sha256_avx()) {
+ unregister_sha256_ssse3();
+ goto fail;
}
- pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+ if (register_sha256_avx2()) {
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ if (register_sha256_ni()) {
+ unregister_sha256_avx2();
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
return -ENODEV;
}
static void __exit sha256_ssse3_mod_fini(void)
{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ unregister_sha256_ni();
+ unregister_sha256_avx2();
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
}
module_init(sha256_ssse3_mod_init);
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0c8c38c101ac..34e5083d6f36 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -41,19 +41,11 @@
asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
u64 rounds);
-#ifdef CONFIG_AS_AVX
-asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
- u64 rounds);
-#endif
-#ifdef CONFIG_AS_AVX2
-asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
- u64 rounds);
-#endif
-static void (*sha512_transform_asm)(u64 *, const char *, u64);
+typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds);
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int sha512_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha512_transform_fn *sha512_xform)
{
struct sha512_state *sctx = shash_desc_ctx(desc);
@@ -66,14 +58,14 @@ static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_transform_asm);
+ (sha512_block_fn *)sha512_xform);
kernel_fpu_end();
return 0;
}
-static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+static int sha512_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha512_transform_fn *sha512_xform)
{
if (!irq_fpu_usable())
return crypto_sha512_finup(desc, data, len, out);
@@ -81,20 +73,32 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
if (len)
sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_transform_asm);
- sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm);
+ (sha512_block_fn *)sha512_xform);
+ sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform);
kernel_fpu_end();
return sha512_base_finish(desc, out);
}
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_ssse3);
+}
+
+static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
+}
+
/* Add padding and return the message digest. */
static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
{
return sha512_ssse3_finup(desc, NULL, 0, out);
}
-static struct shash_alg algs[] = { {
+static struct shash_alg sha512_ssse3_algs[] = { {
.digestsize = SHA512_DIGEST_SIZE,
.init = sha512_base_init,
.update = sha512_ssse3_update,
@@ -126,8 +130,25 @@ static struct shash_alg algs[] = { {
}
} };
+static int register_sha512_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(sha512_ssse3_algs,
+ ARRAY_SIZE(sha512_ssse3_algs));
+ return 0;
+}
+
+static void unregister_sha512_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(sha512_ssse3_algs,
+ ARRAY_SIZE(sha512_ssse3_algs));
+}
+
#ifdef CONFIG_AS_AVX
-static bool __init avx_usable(void)
+asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
+ u64 rounds);
+static bool avx_usable(void)
{
if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (cpu_has_avx)
@@ -137,47 +158,185 @@ static bool __init avx_usable(void)
return true;
}
-#endif
-static int __init sha512_ssse3_mod_init(void)
+static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- /* test for SSSE3 first */
- if (cpu_has_ssse3)
- sha512_transform_asm = sha512_transform_ssse3;
+ return sha512_update(desc, data, len, sha512_transform_avx);
+}
-#ifdef CONFIG_AS_AVX
- /* allow AVX to override SSSE3, it's a little faster */
- if (avx_usable()) {
-#ifdef CONFIG_AS_AVX2
- if (boot_cpu_has(X86_FEATURE_AVX2))
- sha512_transform_asm = sha512_transform_rorx;
- else
-#endif
- sha512_transform_asm = sha512_transform_avx;
+static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_avx);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_avx_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_avx_update,
+ .final = sha512_avx_final,
+ .finup = sha512_avx_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
}
-#endif
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_avx_update,
+ .final = sha512_avx_final,
+ .finup = sha512_avx_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
- if (sha512_transform_asm) {
-#ifdef CONFIG_AS_AVX
- if (sha512_transform_asm == sha512_transform_avx)
- pr_info("Using AVX optimized SHA-512 implementation\n");
-#ifdef CONFIG_AS_AVX2
- else if (sha512_transform_asm == sha512_transform_rorx)
- pr_info("Using AVX2 optimized SHA-512 implementation\n");
+static int register_sha512_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shashes(sha512_avx_algs,
+ ARRAY_SIZE(sha512_avx_algs));
+ return 0;
+}
+
+static void unregister_sha512_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shashes(sha512_avx_algs,
+ ARRAY_SIZE(sha512_avx_algs));
+}
+#else
+static inline int register_sha512_avx(void) { return 0; }
+static inline void unregister_sha512_avx(void) { }
#endif
- else
+
+#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
+asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
+ u64 rounds);
+
+static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_rorx);
+}
+
+static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_rorx);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_avx2_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_avx2_update,
+ .final = sha512_avx2_final,
+ .finup = sha512_avx2_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_avx2_update,
+ .final = sha512_avx2_final,
+ .finup = sha512_avx2_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static int register_sha512_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shashes(sha512_avx2_algs,
+ ARRAY_SIZE(sha512_avx2_algs));
+ return 0;
+}
+
+static void unregister_sha512_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shashes(sha512_avx2_algs,
+ ARRAY_SIZE(sha512_avx2_algs));
+}
+#else
+static inline int register_sha512_avx2(void) { return 0; }
+static inline void unregister_sha512_avx2(void) { }
#endif
- pr_info("Using SSSE3 optimized SHA-512 implementation\n");
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+static int __init sha512_ssse3_mod_init(void)
+{
+
+ if (register_sha512_ssse3())
+ goto fail;
+
+ if (register_sha512_avx()) {
+ unregister_sha512_ssse3();
+ goto fail;
}
- pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+ if (register_sha512_avx2()) {
+ unregister_sha512_avx();
+ unregister_sha512_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
return -ENODEV;
}
static void __exit sha512_ssse3_mod_fini(void)
{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ unregister_sha512_avx2();
+ unregister_sha512_avx();
+ unregister_sha512_ssse3();
}
module_init(sha512_ssse3_mod_init);
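
The new init path above registers the SSSE3, AVX and AVX2 variants in turn and unwinds whatever already succeeded when a later step fails. A minimal userspace sketch of that register-then-unwind shape, with made-up names standing in for the crypto API calls:

/* Illustrative only: register_a/register_b mimic the per-ISA registration
 * helpers; a nonzero return means failure, as in the glue code above. */
#include <stdio.h>

static int register_a(void) { return 0; }	/* pretend this succeeds */
static void unregister_a(void) { }
static int register_b(void) { return -1; }	/* pretend this fails */

static int mod_init(void)
{
	if (register_a())
		goto fail;
	if (register_b()) {
		unregister_a();			/* unwind what succeeded */
		goto fail;
	}
	return 0;
fail:
	return -1;				/* -ENODEV in the real code */
}

int main(void)
{
	printf("init: %d\n", mod_init());
	return 0;
}
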
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 53616ca03244..a55697d19824 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -509,6 +509,17 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
+
+ /*
+ * We need to tell lockdep that IRQs are off. We can't do this until
+ * we fix gsbase, and we should do it before enter_from_user_mode
+	 * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
+ * the simplest way to handle it is to just call it twice if
+ * we enter from user mode. There's no reason to optimize this since
+ * TRACE_IRQS_OFF is a no-op if lockdep is off.
+ */
+ TRACE_IRQS_OFF
+
#ifdef CONFIG_CONTEXT_TRACKING
call enter_from_user_mode
#endif
@@ -1049,12 +1060,18 @@ ENTRY(error_entry)
SWAPGS
.Lerror_entry_from_usermode_after_swapgs:
+ /*
+ * We need to tell lockdep that IRQs are off. We can't do this until
+ * we fix gsbase, and we should do it before enter_from_user_mode
+ * (which can take locks).
+ */
+ TRACE_IRQS_OFF
#ifdef CONFIG_CONTEXT_TRACKING
call enter_from_user_mode
#endif
+ ret
.Lerror_entry_done:
-
TRACE_IRQS_OFF
ret
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index caa2c712d1e7..f17705e1332c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -382,3 +382,4 @@
373 i386 shutdown sys_shutdown
374 i386 userfaultfd sys_userfaultfd
375 i386 membarrier sys_membarrier
+376 i386 mlock2 sys_mlock2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 278842fdf1f6..314a90bfc09c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -331,6 +331,7 @@
322 64 execveat stub_execveat
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
+325 common mlock2 sys_mlock2
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 04e9d023168f..1c0b43724ce3 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -68,7 +68,6 @@ void *kmap_atomic(struct page *page);
void __kunmap_atomic(void *kvaddr);
void *kmap_atomic_pfn(unsigned long pfn);
void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
-struct page *kmap_atomic_to_page(void *ptr);
#define flush_cache_kmaps() do { } while (0)
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index ccffa53750a8..39bcefc20de7 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,6 +60,7 @@ struct legacy_pic {
void (*mask_all)(void);
void (*restore_mask)(void);
void (*init)(int auto_eoi);
+ int (*probe)(void);
int (*irq_pending)(unsigned int irq);
void (*make_irq)(unsigned int irq);
};
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 046c7fb1ca43..a210eba2727c 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,6 +33,11 @@ enum irq_remap_cap {
IRQ_POSTING_CAP = 0,
};
+struct vcpu_data {
+ u64 pi_desc_addr; /* Physical address of PI Descriptor */
+ u32 vector; /* Guest vector of the interrupt */
+};
+
#ifdef CONFIG_IRQ_REMAP
extern bool irq_remapping_cap(enum irq_remap_cap cap);
@@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
return x86_vector_domain;
}
-struct vcpu_data {
- u64 pi_desc_addr; /* Physical address of PI Descriptor */
- u32 vector; /* Guest vector of the interrupt */
-};
-
#else /* CONFIG_IRQ_REMAP */
static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e16466ec473c..e9cd7befcb76 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -112,6 +112,16 @@ struct x86_emulate_ops {
struct x86_exception *fault);
/*
+ * read_phys: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Physical address from which to read.
+ * @val: [OUT] Value read from memory.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+ void *val, unsigned int bytes);
+
+ /*
* write_std: Write bytes of standard (non-emulated/special) memory.
* Used for descriptor writing.
* @addr: [IN ] Linear address to which to write.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a36ee704c30..30cfd64295a0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -24,6 +24,7 @@
#include <linux/perf_event.h>
#include <linux/pvclock_gtod.h>
#include <linux/clocksource.h>
+#include <linux/irqbypass.h>
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
@@ -176,6 +177,8 @@ enum {
*/
#define KVM_APIC_PV_EOI_PENDING 1
+struct kvm_kernel_irq_routing_entry;
+
/*
* We don't want allocation failures within the mmu code, so we preallocate
* enough memory for a single page fault in a cache.
@@ -374,6 +377,7 @@ struct kvm_mtrr {
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
u64 hv_vapic;
+ s64 runtime_offset;
};
struct kvm_vcpu_arch {
@@ -396,6 +400,7 @@ struct kvm_vcpu_arch {
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
+ u64 eoi_exit_bitmap[4];
unsigned long apic_attention;
int32_t apic_arb_prio;
int mp_state;
@@ -500,6 +505,7 @@ struct kvm_vcpu_arch {
u32 virtual_tsc_mult;
u32 virtual_tsc_khz;
s64 ia32_tsc_adjust_msr;
+ u64 tsc_scaling_ratio;
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -573,6 +579,9 @@ struct kvm_vcpu_arch {
struct {
bool pv_unhalted;
} pv;
+
+ int pending_ioapic_eoi;
+ int pending_external_vector;
};
struct kvm_lpage_info {
@@ -683,6 +692,9 @@ struct kvm_arch {
u32 bsp_vcpu_id;
u64 disabled_quirks;
+
+ bool irqchip_split;
+ u8 nr_reserved_ioapic_pins;
};
struct kvm_vm_stat {
@@ -766,7 +778,7 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
- void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
+ void (*update_bp_intercept)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -819,10 +831,10 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- int (*vm_has_apicv)(struct kvm *kvm);
+ int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm *kvm, int isr);
- void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
@@ -833,7 +845,7 @@ struct kvm_x86_ops {
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
bool (*invpcid_supported)(void);
- void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
+ void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment);
void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -841,11 +853,9 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
- u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
@@ -887,6 +897,20 @@ struct kvm_x86_ops {
gfn_t offset, unsigned long mask);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
+
+ /*
+ * Architecture specific hooks for vCPU blocking due to
+ * HLT instruction.
+ * Returns for .pre_block():
+ * - 0 means continue to block the vCPU.
+ * - 1 means we cannot block the vCPU since some event
+ * happens during this period, such as, 'ON' bit in
+ * posted-interrupts descriptor is set.
+ */
+ int (*pre_block)(struct kvm_vcpu *vcpu);
+ void (*post_block)(struct kvm_vcpu *vcpu);
+ int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
};
struct kvm_arch_async_pf {
@@ -898,17 +922,6 @@ struct kvm_arch_async_pf {
extern struct kvm_x86_ops *kvm_x86_ops;
-static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
- s64 adjustment)
-{
- kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
-}
-
-static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
-{
- kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
-}
-
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
@@ -961,10 +974,12 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
/* control of guest tsc rate supported? */
extern bool kvm_has_tsc_control;
-/* minimum supported tsc_khz for guests */
-extern u32 kvm_min_guest_tsc_khz;
/* maximum supported tsc_khz for guests */
extern u32 kvm_max_guest_tsc_khz;
+/* number of bits of the fractional part of the TSC scaling ratio */
+extern u8 kvm_tsc_scaling_ratio_frac_bits;
+/* maximum allowed value of TSC scaling ratio */
+extern u64 kvm_max_tsc_scaling_ratio;
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -1210,6 +1225,9 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
+
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
@@ -1231,4 +1249,13 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq);
+
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b8c14bb7fc8f..690b4027e17c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -35,7 +35,7 @@
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
-#define MSR_NHM_PLATFORM_INFO 0x000000ce
+#define MSR_PLATFORM_INFO 0x000000ce
#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
#define NHM_C3_AUTO_DEMOTE (1UL << 25)
@@ -44,7 +44,6 @@
#define SNB_C1_AUTO_UNDEMOTE (1UL << 27)
#define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
-#define MSR_PLATFORM_INFO 0x000000ce
#define MSR_MTRRcap 0x000000fe
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
@@ -206,6 +205,13 @@
#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
+/* Config TDP MSRs */
+#define MSR_CONFIG_TDP_NOMINAL 0x00000648
+#define MSR_CONFIG_TDP_LEVEL1 0x00000649
+#define MSR_CONFIG_TDP_LEVEL2 0x0000064A
+#define MSR_CONFIG_TDP_CONTROL 0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
+
/* Hardware P state interface */
#define MSR_PPERF 0x0000064e
#define MSR_PERF_LIMIT_REASONS 0x0000064f
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index c5b7fb2774d0..cc071c6f7d4d 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -9,19 +9,21 @@
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+
+#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
+
#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
-/* Cast PAGE_MASK to a signed type so that it is sign-extended if
+/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
virtual addresses are 32-bits but physical addresses are larger
(ie, 32-bit PAE). */
#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
-
-#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
-#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
-
-#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
-#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
+#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK)
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
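
The comment above is the whole point of keeping these masks signed. A small userspace sketch (4K page size assumed, plain stdint types instead of phys_addr_t) shows what sign extension buys when a 32-bit mask is widened:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t page_mask32 = ~(uint32_t)(4096 - 1);	/* 0xfffff000 */
	uint64_t zero_ext = (uint64_t)page_mask32;
	uint64_t sign_ext = (uint64_t)(int64_t)(int32_t)page_mask32;

	/* Zero extension would chop physical bits above bit 31 on PAE;
	 * sign extension keeps them. */
	printf("zero-extended: %#018llx\n", (unsigned long long)zero_ext);
	printf("sign-extended: %#018llx\n", (unsigned long long)sign_ext);
	return 0;
}
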
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e99cbe814ea8..d3eee663c41f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -322,6 +322,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
}
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
/*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index dd5b0aa9dd2f..a471cadb9630 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -279,17 +279,14 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
static inline pudval_t pud_pfn_mask(pud_t pud)
{
if (native_pud_val(pud) & _PAGE_PSE)
- return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK;
+ return PHYSICAL_PUD_PAGE_MASK;
else
return PTE_PFN_MASK;
}
static inline pudval_t pud_flags_mask(pud_t pud)
{
- if (native_pud_val(pud) & _PAGE_PSE)
- return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK);
- else
- return ~PTE_PFN_MASK;
+ return ~pud_pfn_mask(pud);
}
static inline pudval_t pud_flags(pud_t pud)
@@ -300,17 +297,14 @@ static inline pudval_t pud_flags(pud_t pud)
static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
{
if (native_pmd_val(pmd) & _PAGE_PSE)
- return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK;
+ return PHYSICAL_PMD_PAGE_MASK;
else
return PTE_PFN_MASK;
}
static inline pmdval_t pmd_flags_mask(pmd_t pmd)
{
- if (native_pmd_val(pmd) & _PAGE_PSE)
- return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK);
- else
- return ~PTE_PFN_MASK;
+ return ~pmd_pfn_mask(pmd);
}
static inline pmdval_t pmd_flags(pmd_t pmd)
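
The simplification above relies on the flags mask being exactly the complement of the pfn mask, so an entry splits cleanly into the two parts. A tiny sketch with a made-up 4K entry layout, illustrating only the complement trick rather than the real kernel masks:

#include <assert.h>
#include <stdint.h>

/* Hypothetical entry: low 12 bits are flags, the rest is the frame address. */
static uint64_t pfn_mask(void)
{
	return ~((uint64_t)(1 << 12) - 1);
}

static uint64_t flags_mask(void)
{
	return ~pfn_mask();		/* same simplification as the patch */
}

int main(void)
{
	uint64_t e = 0x123456789000ULL | 0x67;	/* frame bits | flag bits */

	assert((e & pfn_mask()) == 0x123456789000ULL);
	assert((e & flags_mask()) == 0x67);
	assert(((e & pfn_mask()) | (e & flags_mask())) == e);
	return 0;
}
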
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 448b7ca61aee..14c63c7e8337 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,7 +72,8 @@
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
-
+#define SECONDARY_EXEC_PCOMMIT 0x00200000
+#define SECONDARY_EXEC_TSC_SCALING 0x02000000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
@@ -167,6 +168,8 @@ enum vmcs_field {
VMWRITE_BITMAP = 0x00002028,
XSS_EXIT_BITMAP = 0x0000202C,
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ TSC_MULTIPLIER = 0x00002032,
+ TSC_MULTIPLIER_HIGH = 0x00002033,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
@@ -416,6 +419,7 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */
#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 10002a46c593..1ae89a2721d6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -1,7 +1,6 @@
#ifndef _ASM_X86_PLATFORM_H
#define _ASM_X86_PLATFORM_H
-#include <asm/pgtable_types.h>
#include <asm/bootparam.h>
struct mpc_bus;
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d866959e5685..8b2d4bea9962 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void)
}
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num);
+void xen_arch_unregister_cpu(int num);
+#endif
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 0679e11d2cf7..f5fb840b43e8 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -12,7 +12,7 @@
#include <asm/pgtable.h>
#include <xen/interface/xen.h>
-#include <xen/grant_table.h>
+#include <xen/interface/grant_table.h>
#include <xen/features.h>
/* Xen machine address */
@@ -43,6 +43,8 @@ extern unsigned long *xen_p2m_addr;
extern unsigned long xen_p2m_size;
extern unsigned long xen_max_p2m_pfn;
+extern int xen_alloc_p2m_entry(unsigned long pfn);
+
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
@@ -296,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr);
#define xen_unmap(cookie) iounmap((cookie))
static inline bool xen_arch_need_swiotlb(struct device *dev,
- unsigned long pfn,
- unsigned long bfn)
+ phys_addr_t phys,
+ dma_addr_t dev_addr)
{
return false;
}
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index f0412c50c47b..040d4083c24f 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -153,6 +153,12 @@
/* MSR used to provide vcpu index */
#define HV_X64_MSR_VP_INDEX 0x40000002
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET 0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME 0x40000010
+
/* MSR used to read the per-partition time reference counter */
#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
@@ -251,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
__s64 tsc_offset;
} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
+/* Define the number of synthetic interrupt sources. */
+#define HV_SYNIC_SINT_COUNT (16)
+/* Define the expected SynIC version. */
+#define HV_SYNIC_VERSION_1 (0x1)
+
+#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
+#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
+#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
+#define HV_SYNIC_SINT_MASKED (1ULL << 16)
+#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
+#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
+
#endif
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index b5d7640abc5d..8a4add8e4639 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -100,6 +100,7 @@
{ SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
{ SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
{ SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
+ { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \
{ SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
{ SVM_EXIT_INTR, "interrupt" }, \
{ SVM_EXIT_NMI, "nmi" }, \
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 37fee272618f..5b15d94a33f8 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -78,6 +78,7 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_PCOMMIT 65
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -126,7 +127,8 @@
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
- { EXIT_REASON_XRSTORS, "XRSTORS" }
+ { EXIT_REASON_XRSTORS, "XRSTORS" }, \
+ { EXIT_REASON_PCOMMIT, "PCOMMIT" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ded848c20e05..e75907601a41 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -976,6 +976,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
{
int count;
int x2count = 0;
+ int ret;
+ struct acpi_subtable_proc madt_proc[2];
if (!cpu_has_apic)
return -ENODEV;
@@ -999,10 +1001,22 @@ static int __init acpi_parse_madt_lapic_entries(void)
acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
- acpi_parse_x2apic, MAX_LOCAL_APIC);
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
- acpi_parse_lapic, MAX_LOCAL_APIC);
+ memset(madt_proc, 0, sizeof(madt_proc));
+ madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
+ madt_proc[0].handler = acpi_parse_lapic;
+ madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
+ madt_proc[1].handler = acpi_parse_x2apic;
+ ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
+ sizeof(struct acpi_table_madt),
+ madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
+ if (ret < 0) {
+ printk(KERN_ERR PREFIX
+ "Error parsing LAPIC/X2APIC entries\n");
+ return ret;
+ }
+
+		count = madt_proc[0].count;
+		x2count = madt_proc[1].count;
}
if (!count && !x2count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
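
The rewritten MADT parsing above replaces two separate passes with one table-driven walk over {id, handler} descriptors that also accumulates per-descriptor counts. The shape of that pattern, sketched in plain C with invented ids and handlers rather than the real ACPI interfaces:

#include <stdio.h>

struct subtable_proc {
	int id;
	void (*handler)(void);
	int count;
};

static void on_lapic(void)  { }
static void on_x2apic(void) { }

int main(void)
{
	/* ids 0 and 9 stand in for the LAPIC/X2APIC subtable type codes */
	struct subtable_proc procs[] = {
		{ .id = 0, .handler = on_lapic  },
		{ .id = 9, .handler = on_x2apic },
	};
	int entries[] = { 0, 0, 9, 0 };		/* pretend MADT subtables */
	unsigned int i, p;

	for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++)
		for (p = 0; p < sizeof(procs) / sizeof(procs[0]); p++)
			if (procs[p].id == entries[i]) {
				procs[p].handler();
				procs[p].count++;
			}

	printf("lapic=%d x2apic=%d\n", procs[0].count, procs[1].count);
	return 0;
}
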
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 836d11b92811..861bc59c8f25 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -361,7 +361,11 @@ int __init arch_probe_nr_irqs(void)
if (nr < nr_irqs)
nr_irqs = nr;
- return nr_legacy_irqs();
+ /*
+ * We don't know if PIC is present at this point so we need to do
+ * probe() to get the right number of legacy IRQs.
+ */
+ return legacy_pic->probe();
}
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4a70fc6d400a..a8816b325162 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -352,6 +352,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
+ unsigned int socket_id, core_complex_id;
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
@@ -361,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
+
+ /*
+ * Fix percpu cpu_llc_id here as LLC topology is different
+ * for Fam17h systems.
+ */
+ if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
+ return;
+
+ socket_id = (c->apicid >> bits) - 1;
+ core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3;
+
+ per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id;
#endif
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4ddd780aeac9..c2b7522cbf35 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -273,10 +273,9 @@ __setup("nosmap", setup_disable_smap);
static __always_inline void setup_smap(struct cpuinfo_x86 *c)
{
- unsigned long eflags;
+ unsigned long eflags = native_save_fl();
/* This should have been cleared long ago */
- raw_local_save_flags(eflags);
BUG_ON(eflags & X86_EFLAGS_AC);
if (cpu_has(c, X86_FEATURE_SMAP)) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 98a13db5f4be..209ac1e7d1f0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 0x27: /* Penwell */
case 0x35: /* Cloverview */
+ case 0x4a: /* Merrifield */
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
break;
default:
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 7fc27f1cca58..b3e94ef461fd 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -698,3 +698,4 @@ int __init microcode_init(void)
return error;
}
+late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4562cf070c27..2bf79d7c97df 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -5,7 +5,7 @@
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 499f533dd3cc..d0e35ebb2adb 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -5,7 +5,7 @@
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
@@ -387,7 +387,7 @@ struct cpu_hw_events {
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
- INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
@@ -627,6 +627,7 @@ struct x86_perf_task_context {
u64 lbr_from[MAX_LBR_ENTRIES];
u64 lbr_to[MAX_LBR_ENTRIES];
u64 lbr_info[MAX_LBR_ENTRIES];
+ int tos;
int lbr_callstack_users;
int lbr_stack_state;
};
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f63360be2238..e2a430021e46 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -232,7 +232,7 @@ static struct event_constraint intel_hsw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 377e8f8ed391..a316ca96f1b6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
if (event->attach_state & PERF_ATTACH_TASK)
- return perf_cgroup_from_task(event->hw.target);
+ return perf_cgroup_from_task(event->hw.target, event->ctx);
return event->cgrp;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index bfd0b717e944..659f01e165d5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -239,7 +239,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
}
mask = x86_pmu.lbr_nr - 1;
- tos = intel_pmu_lbr_tos();
+ tos = task_ctx->tos;
for (i = 0; i < tos; i++) {
lbr_idx = (tos - i) & mask;
wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
@@ -247,6 +247,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
+ wrmsrl(x86_pmu.lbr_tos, tos);
task_ctx->lbr_stack_state = LBR_NONE;
}
@@ -270,6 +271,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
+ task_ctx->tos = tos;
task_ctx->lbr_stack_state = LBR_VALID;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 81431c0f0614..ed446bdcbf31 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -107,12 +107,6 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-#define RAPL_EVENT_DESC(_name, _config) \
-{ \
- .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
- .config = _config, \
-}
-
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
#define RAPL_EVENT_ATTR_STR(_name, v, str) \
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
index f32ac13934f2..ec863b9a9f78 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -163,10 +163,9 @@ again:
goto again;
delta = now - prev;
- if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
- delta <<= 32;
- delta >>= 32; /* sign extend */
- }
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+ delta = sign_extend64(delta, 31);
+
local64_add(now - prev, &event->count);
}
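
For the SMI-count change above, this is roughly what sign_extend64(delta, 31) does: bit 31 is treated as the sign bit and propagated into the upper half. The helper below is a local stand-in written for illustration, not the kernel's definition:

#include <stdio.h>
#include <stdint.h>

static int64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;

	return (int64_t)(value << shift) >> shift;
}

int main(void)
{
	/* A 32-bit counter delta with bit 31 set (e.g. a wrap). */
	uint64_t delta = 0xffffffffULL;

	printf("%lld\n", (long long)sign_extend64(delta, 31));	/* -1 */
	return 0;
}
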
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index ef29b742cea7..31c6a60505e6 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
*/
void fpu__init_prepare_fx_sw_frame(void)
{
- int fsave_header_size = sizeof(struct fregs_state);
int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
- if (config_enabled(CONFIG_X86_32))
- size += fsave_header_size;
-
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
fx_sw_reserved.extended_size = size;
fx_sw_reserved.xfeatures = xfeatures_mask;
fx_sw_reserved.xstate_size = xstate_size;
- if (config_enabled(CONFIG_IA32_EMULATION)) {
+ if (config_enabled(CONFIG_IA32_EMULATION) ||
+ config_enabled(CONFIG_X86_32)) {
+ int fsave_header_size = sizeof(struct fregs_state);
+
fx_sw_reserved_ia32 = fx_sw_reserved;
- fx_sw_reserved_ia32.extended_size += fsave_header_size;
+ fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
}
}
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 6454f2731b56..70fc312221fc 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -694,7 +694,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return NULL;
- xsave = &current->thread.fpu.state.xsave;
/*
* We should not ever be requesting features that we
* have not enabled. Remember that pcntxt_mask is
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8b7b0a51e742..311bcf338f07 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -556,6 +556,7 @@ void ftrace_replace_code(int enable)
run_sync();
report = "updating code";
+ count = 0;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
@@ -563,11 +564,13 @@ void ftrace_replace_code(int enable)
ret = add_update(rec, enable);
if (ret)
goto remove_breakpoints;
+ count++;
}
run_sync();
report = "removing breakpoints";
+ count = 0;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
@@ -575,6 +578,7 @@ void ftrace_replace_code(int enable)
ret = finish_update(rec, enable);
if (ret)
goto remove_breakpoints;
+ count++;
}
run_sync();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 1d40ca8a73f2..ffdc0e860390 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -65,6 +65,9 @@ startup_64:
* tables and then reload them.
*/
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
/*
* Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
@@ -174,6 +177,9 @@ ENTRY(secondary_startup_64)
* after the boot processor executes this code.
*/
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
movq $(init_level4_pgt - __START_KERNEL_map), %rax
1:
@@ -288,6 +294,8 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
+#include "verify_cpu.S"
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* Boot CPU0 entry point. It's called from play_dead(). Everything has been set
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 16cb827a5b27..be22f5a2192e 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -295,16 +295,11 @@ static void unmask_8259A(void)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-static void init_8259A(int auto_eoi)
+static int probe_8259A(void)
{
unsigned long flags;
unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
unsigned char new_val;
-
- i8259A_auto_eoi = auto_eoi;
-
- raw_spin_lock_irqsave(&i8259A_lock, flags);
-
/*
* Check to see if we have a PIC.
* Mask all except the cascade and read
@@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi)
* have a PIC, we will read 0xff as opposed to the
* value we wrote.
*/
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
outb(probe_val, PIC_MASTER_IMR);
new_val = inb(PIC_MASTER_IMR);
if (new_val != probe_val) {
printk(KERN_INFO "Using NULL legacy PIC\n");
legacy_pic = &null_legacy_pic;
- raw_spin_unlock_irqrestore(&i8259A_lock, flags);
- return;
}
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return nr_legacy_irqs();
+}
+
+static void init_8259A(int auto_eoi)
+{
+ unsigned long flags;
+
+ i8259A_auto_eoi = auto_eoi;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
/*
@@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
{
return 0;
}
+static int legacy_pic_probe(void)
+{
+ return 0;
+}
struct legacy_pic null_legacy_pic = {
.nr_legacy_irqs = 0,
@@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = {
.mask_all = legacy_pic_noop,
.restore_mask = legacy_pic_noop,
.init = legacy_pic_int_noop,
+ .probe = legacy_pic_probe,
.irq_pending = legacy_pic_irq_pending_noop,
.make_irq = legacy_pic_uint_noop,
};
@@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = {
.mask_all = mask_8259A,
.restore_mask = unmask_8259A,
.init = init_8259A,
+ .probe = probe_8259A,
.irq_pending = i8259A_irq_pending,
.make_irq = make_8259A_irq,
};
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index dc5fa6a1e8d6..3512ba607361 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -1,7 +1,7 @@
/*
* x86 specific code for irq_work
*
- * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
*/
#include <linux/kernel.h>
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2c7aafa70702..2bd81e302427 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -32,6 +32,7 @@
static int kvmclock = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+static cycle_t kvm_sched_clock_offset;
static int parse_no_kvmclock(char *arg)
{
@@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
return kvm_clock_read();
}
+static cycle_t kvm_sched_clock_read(void)
+{
+ return kvm_clock_read() - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+ if (!stable) {
+ pv_time_ops.sched_clock = kvm_clock_read;
+ return;
+ }
+
+ kvm_sched_clock_offset = kvm_clock_read();
+ pv_time_ops.sched_clock = kvm_sched_clock_read;
+ set_sched_clock_stable();
+
+ printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
+ kvm_sched_clock_offset);
+
+ BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -248,7 +272,17 @@ void __init kvmclock_init(void)
memblock_free(mem, size);
return;
}
- pv_time_ops.sched_clock = kvm_clock_read;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ cpu = get_cpu();
+ vcpu_time = &hv_clock[cpu].pvti;
+ flags = pvclock_read_flags(vcpu_time);
+
+ kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+ put_cpu();
+
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
@@ -265,16 +299,6 @@ void __init kvmclock_init(void)
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
-
- if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
- pvclock_set_flags(~0);
-
- cpu = get_cpu();
- vcpu_time = &hv_clock[cpu].pvti;
- flags = pvclock_read_flags(vcpu_time);
- if (flags & PVCLOCK_COUNTS_FROM_ZERO)
- set_sched_clock_stable();
- put_cpu();
}
int __init kvm_setup_vsyscall_timeinfo(void)
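
The kvmclock change above records the clock value once at init and, when the clocksource is stable, reports scheduler time relative to that point. A userspace sketch of the offset idea, with clock_gettime() standing in for kvm_clock_read():

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t raw_clock_ns(void)	/* stands in for kvm_clock_read() */
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static uint64_t sched_clock_offset;

static uint64_t sched_clock_read(void)
{
	return raw_clock_ns() - sched_clock_offset;
}

int main(void)
{
	sched_clock_offset = raw_clock_ns();	/* captured once at "init" */
	printf("%llu ns since init\n",
	       (unsigned long long)sched_clock_read());
	return 0;
}
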
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
index ff3c3101d003..d1d35ccffed3 100644
--- a/arch/x86/kernel/livepatch.c
+++ b/arch/x86/kernel/livepatch.c
@@ -42,7 +42,6 @@ int klp_write_module_reloc(struct module *mod, unsigned long type,
bool readonly;
unsigned long val;
unsigned long core = (unsigned long)mod->module_core;
- unsigned long core_ro_size = mod->core_ro_size;
unsigned long core_size = mod->core_size;
switch (type) {
@@ -70,10 +69,12 @@ int klp_write_module_reloc(struct module *mod, unsigned long type,
/* loc does not point to any symbol inside the module */
return -EINVAL;
- if (loc < core + core_ro_size)
+ readonly = false;
+
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+ if (loc < core + mod->core_ro_size)
readonly = true;
- else
- readonly = false;
+#endif
/* determine if the relocation spans a page boundary */
numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index 94ea120fa21f..87e1762e2bca 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -278,6 +278,12 @@ trace:
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
+ /*
+ * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not
+ * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the
+ * ip and parent ip are used and the list function is called when
+ * function tracing is enabled.
+ */
call *ftrace_trace_function
restore_mcount_regs
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cd99433b8ba1..6ba014c61d62 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
again:
page = NULL;
/* CMA can be used only in the context which permits sleeping */
- if (flag & __GFP_WAIT) {
+ if (gfpflags_allow_blocking(flag)) {
page = dma_alloc_from_contiguous(dev, count, get_order(size));
if (page && page_to_phys(page) + size > dma_mask) {
dma_release_from_contiguous(dev, page, count);
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 4f00b63d7ff3..14415aff1813 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -4,10 +4,22 @@
*/
#include <linux/platform_device.h>
#include <linux/module.h>
+#include <linux/ioport.h>
+
+static int found(u64 start, u64 end, void *data)
+{
+ return 1;
+}
static __init int register_e820_pmem(void)
{
+ char *pmem = "Persistent Memory (legacy)";
struct platform_device *pdev;
+ int rc;
+
+ rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found);
+ if (rc <= 0)
+ return 0;
/*
* See drivers/nvdimm/e820.c for the implementation, this is
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a1e4da98c8f0..d2bbe343fda7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1188,7 +1188,7 @@ void __init setup_arch(char **cmdline_p)
*/
clone_pgd_range(initial_page_table,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- KERNEL_PGD_PTRS);
+ min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
#endif
tboot_probe();
@@ -1250,8 +1250,6 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled(EFI_BOOT))
efi_apply_memmap_quirks();
#endif
-
- microcode_init();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b7ffb7c00075..cb6282c3638f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -690,12 +690,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
signal_setup_done(failed, ksig, stepping);
}
-#ifdef CONFIG_X86_32
-#define NR_restart_syscall __NR_restart_syscall
-#else /* !CONFIG_X86_32 */
-#define NR_restart_syscall \
- test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
-#endif /* CONFIG_X86_32 */
+static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
+{
+#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64)
+ return __NR_restart_syscall;
+#else /* !CONFIG_X86_32 && CONFIG_X86_64 */
+ return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall :
+ __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
+#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */
+}
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
@@ -724,7 +727,7 @@ void do_signal(struct pt_regs *regs)
break;
case -ERESTART_RESTARTBLOCK:
- regs->ax = NR_restart_syscall;
+ regs->ax = get_nr_restart_syscall(regs);
regs->ip -= 2;
break;
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4df777710ab7..f2281e9cfdbe 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid)
*/
#define UDELAY_10MS_DEFAULT 10000
-static unsigned int init_udelay = INT_MAX;
+static unsigned int init_udelay = UINT_MAX;
static int __init cpu_init_udelay(char *str)
{
@@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay);
static void __init smp_quirk_init_udelay(void)
{
/* if cmdline changed it from default, leave it alone */
- if (init_udelay != INT_MAX)
+ if (init_udelay != UINT_MAX)
return;
/* if modern processor, use no delay */
if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
- ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF)))
+ ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
init_udelay = 0;
-
+ return;
+ }
/* else, use legacy delay */
init_udelay = UDELAY_10MS_DEFAULT;
}
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index b9242bacbe59..4cf401f581e7 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -34,10 +34,11 @@
#include <asm/msr-index.h>
verify_cpu:
- pushfl # Save caller passed flags
- pushl $0 # Kill any dangerous flags
- popfl
+ pushf # Save caller passed flags
+ push $0 # Kill any dangerous flags
+ popf
+#ifndef __x86_64__
pushfl # standard way to check for cpuid
popl %eax
movl %eax,%ebx
@@ -48,6 +49,7 @@ verify_cpu:
popl %eax
cmpl %eax,%ebx
jz verify_cpu_no_longmode # cpu has no cpuid
+#endif
movl $0x0,%eax # See if cpuid 1 is implemented
cpuid
@@ -130,10 +132,10 @@ verify_cpu_sse_test:
jmp verify_cpu_sse_test # try again
verify_cpu_no_longmode:
- popfl # Restore caller passed flags
+ popf # Restore caller passed flags
movl $1,%eax
ret
verify_cpu_sse_ok:
- popfl # Restore caller passed flags
+ popf # Restore caller passed flags
xorl %eax, %eax
ret
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index d8a1d56276e1..639a6e34500c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@ config KVM
select ANON_INODES
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
+ select IRQ_BYPASS_MANAGER
+ select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index d090ecf08809..9dc091acd5fb 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -21,6 +21,7 @@
#include <linux/fs.h>
#include "irq.h"
#include "assigned-dev.h"
+#include "trace/events/kvm.h"
struct kvm_assigned_dev_kernel {
struct kvm_irq_ack_notifier ack_notifier;
@@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
return IRQ_HANDLED;
}
-#ifdef __KVM_HAVE_MSI
+/*
+ * Deliver an IRQ in an atomic context if we can, or return a failure,
+ * user can retry in a process context.
+ * Return value:
+ * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
+ * Other values - No need to retry.
+ */
+static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
+ int level)
+{
+ struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
+ struct kvm_kernel_irq_routing_entry *e;
+ int ret = -EINVAL;
+ int idx;
+
+ trace_kvm_set_irq(irq, level, irq_source_id);
+
+ /*
+ * Injection into either PIC or IOAPIC might need to scan all CPUs,
+ * which would need to be retried from thread context; when same GSI
+ * is connected to both PIC and IOAPIC, we'd have to report a
+ * partial failure here.
+ * Since there's no easy way to do this, we only support injecting MSI
+ * which is limited to 1:1 GSI mapping.
+ */
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
+ e = &entries[0];
+ ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
+ irq, level);
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+
static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
return IRQ_HANDLED;
}
-#endif
-#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
return IRQ_HANDLED;
}
-#endif
/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
@@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
return 0;
}
-#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_host_msi(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
return 0;
}
-#endif
-#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_host_msix(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@@ -443,8 +473,6 @@ err:
return r;
}
-#endif
-
static int assigned_device_enable_guest_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm,
return 0;
}
-#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_guest_msi(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
dev->ack_notifier.gsi = -1;
return 0;
}
-#endif
-#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_guest_msix(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
dev->ack_notifier.gsi = -1;
return 0;
}
-#endif
static int assign_host_irq(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
@@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm,
case KVM_DEV_IRQ_HOST_INTX:
r = assigned_device_enable_host_intx(kvm, dev);
break;
-#ifdef __KVM_HAVE_MSI
case KVM_DEV_IRQ_HOST_MSI:
r = assigned_device_enable_host_msi(kvm, dev);
break;
-#endif
-#ifdef __KVM_HAVE_MSIX
case KVM_DEV_IRQ_HOST_MSIX:
r = assigned_device_enable_host_msix(kvm, dev);
break;
-#endif
default:
r = -EINVAL;
}
@@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm,
case KVM_DEV_IRQ_GUEST_INTX:
r = assigned_device_enable_guest_intx(kvm, dev, irq);
break;
-#ifdef __KVM_HAVE_MSI
case KVM_DEV_IRQ_GUEST_MSI:
r = assigned_device_enable_guest_msi(kvm, dev, irq);
break;
-#endif
-#ifdef __KVM_HAVE_MSIX
case KVM_DEV_IRQ_GUEST_MSIX:
r = assigned_device_enable_guest_msix(kvm, dev, irq);
break;
-#endif
default:
r = -EINVAL;
}
@@ -826,7 +842,6 @@ out:
}
-#ifdef __KVM_HAVE_MSIX
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
struct kvm_assigned_msix_nr *entry_nr)
{
@@ -906,7 +921,6 @@ msix_entry_out:
return r;
}
-#endif
static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
@@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
-#ifdef __KVM_HAVE_MSIX
case KVM_ASSIGN_SET_MSIX_NR: {
struct kvm_assigned_msix_nr entry_nr;
r = -EFAULT;
@@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
-#endif
case KVM_ASSIGN_SET_INTX_MASK: {
struct kvm_assigned_pci_dev assigned_dev;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 156441bcaac8..6525e926f566 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
- F(AVX512CD);
+ F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
/* cpuid 0xD.1.eax */
const u32 kvm_supported_word10_x86_features =
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dd05b9cef6ae..06332cb7e7d1 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -133,4 +133,41 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_MPX));
}
+
+static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
+}
+
+static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ return best && (best->edx & bit(X86_FEATURE_RDTSCP));
+}
+
+/*
+ * NRIPS is provided through cpuid fn 0x8000000a.edx bit 3
+ */
+#define BIT_NRIPS 3
+
+static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
+
+ /*
+ * NRIPS is a scattered cpuid feature, so we can't use
+ * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
+ * position 8, not 3).
+ */
+ return best && (best->edx & bit(BIT_NRIPS));
+}
+#undef BIT_NRIPS
+
#endif
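
The NRIPS helper above has to test the raw CPUID bit position because the feature is scattered in Linux's own feature words. A minimal sketch of checking such a raw bit, with a local bit() helper rather than the kernel macro:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t bit(int nr)		/* local helper, not the kernel macro */
{
	return 1u << nr;
}

static bool has_nrips(uint32_t cpuid_8000000a_edx)
{
	return cpuid_8000000a_edx & bit(3);	/* raw hardware bit position */
}

int main(void)
{
	printf("%d\n", has_nrips(0x8));		/* bit 3 set -> prints 1 */
	return 0;
}
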
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9da95b9daf8d..1505587d06e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
#define GET_SMSTATE(type, smbase, offset) \
({ \
type __val; \
- int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \
- sizeof(__val), NULL); \
+ int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \
+ sizeof(__val)); \
if (r != X86EMUL_CONTINUE) \
return X86EMUL_UNHANDLEABLE; \
__val; \
@@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
/*
* Get back to real mode, to prepare a safe state in which to load
- * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed
- * to read_std/write_std are not virtual.
- *
- * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
*/
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (emulator_has_longmode(ctxt)) {
+ struct desc_struct cs_desc;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ if (cr4 & X86_CR4_PCIDE) {
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+ cr4 &= ~X86_CR4_PCIDE;
+ }
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.p = 1;
+ ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
+ }
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
cr0 = ctxt->ops->get_cr(ctxt, 0);
if (cr0 & X86_CR0_PE)
ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
- cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
if (cr4 & X86_CR4_PAE)
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
efer = 0;
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
@@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
- II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
+ II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index a8160d2ae362..62cf8c915e95 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
case HV_X64_MSR_TIME_REF_COUNT:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_RESET:
r = true;
break;
}
@@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
data);
case HV_X64_MSR_CRASH_CTL:
return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ case HV_X64_MSR_RESET:
+ if (data == 1) {
+ vcpu_debug(vcpu, "hyper-v reset requested\n");
+ kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+ }
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -171,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
return 0;
}
-static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+ cputime_t utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+ return div_u64(cputime_to_nsecs(utime + stime), 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
@@ -205,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ case HV_X64_MSR_VP_RUNTIME:
+ if (!host)
+ return 1;
+ hv->runtime_offset = data - current_task_runtime_100ns();
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
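
For HV_X64_MSR_VP_RUNTIME above, the guest-visible value is "CPU time of the current task in 100ns units" plus a host-written offset, so the host can restore the MSR across migration while guest reads keep ticking. The sketch below is an illustrative userspace model, not KVM code: the task runtime is faked with a counter where the kernel uses task_cputime_adjusted().

/*
 * Illustration of the VP_RUNTIME offset bookkeeping.  The 100ns
 * granularity matches the Hyper-V reference time unit.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t fake_runtime_ns = 5000000000ull;   /* pretend 5s of CPU time */

static uint64_t current_task_runtime_100ns(void)
{
        return fake_runtime_ns / 100;               /* ns -> 100ns units */
}

int main(void)
{
        /* Host (e.g. after migration) writes a saved value of 7s. */
        uint64_t restored = 7000000000ull / 100;
        uint64_t runtime_offset = restored - current_task_runtime_100ns();
        uint64_t msr;

        /* Guest reads the MSR after one more second of CPU time. */
        fake_runtime_ns += 1000000000ull;
        msr = current_task_runtime_100ns() + runtime_offset;
        printf("VP_RUNTIME = %" PRIu64 " (expected %" PRIu64 ")\n",
               msr, (7000000000ull + 1000000000ull) / 100);
        return 0;
}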
@@ -241,6 +262,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
pdata);
case HV_X64_MSR_CRASH_CTL:
return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ case HV_X64_MSR_RESET:
+ data = 0;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -277,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_APIC_ASSIST_PAGE:
data = hv->hv_vapic;
break;
+ case HV_X64_MSR_VP_RUNTIME:
+ data = current_task_runtime_100ns() + hv->runtime_offset;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -295,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
mutex_unlock(&vcpu->kvm->lock);
return r;
} else
- return kvm_hv_set_msr(vcpu, msr, data);
+ return kvm_hv_set_msr(vcpu, msr, data, host);
}
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f90952f64e79..08116ff227cc 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -35,6 +35,7 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
+#include "ioapic.h"
#include "irq.h"
#include "i8254.h"
#include "x86.h"
@@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
s64 interval;
- if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ if (!ioapic_in_kernel(kvm) ||
+ ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
return;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f79105bb5..88d0a92d3f94 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
}
-static void update_handled_vectors(struct kvm_ioapic *ioapic)
-{
- DECLARE_BITMAP(handled_vectors, 256);
- int i;
-
- memset(handled_vectors, 0, sizeof(handled_vectors));
- for (i = 0; i < IOAPIC_NUM_PINS; ++i)
- __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
- memcpy(ioapic->handled_vectors, handled_vectors,
- sizeof(handled_vectors));
- smp_wmb();
-}
-
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
- u32 *tmr)
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
union kvm_ioapic_redirect_entry *e;
@@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
index == RTC_GSI) {
if (kvm_apic_match_dest(vcpu, NULL, 0,
- e->fields.dest_id, e->fields.dest_mode)) {
+ e->fields.dest_id, e->fields.dest_mode) ||
+ (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
+ kvm_apic_pending_eoi(vcpu, e->fields.vector)))
__set_bit(e->fields.vector,
(unsigned long *)eoi_exit_bitmap);
- if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
- __set_bit(e->fields.vector,
- (unsigned long *)tmr);
- }
}
}
spin_unlock(&ioapic->lock);
@@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
e->bits |= (u32) val;
e->fields.remote_irr = 0;
}
- update_handled_vectors(ioapic);
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->id = 0;
memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
rtc_irq_eoi_tracking_reset(ioapic);
- update_handled_vectors(ioapic);
}
static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm)
if (ret < 0) {
kvm->arch.vioapic = NULL;
kfree(ioapic);
+ return ret;
}
+ kvm_vcpu_request_scan_ioapic(kvm);
return ret;
}
@@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
ioapic->irr = 0;
ioapic->irr_delivered = 0;
- update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
kvm_ioapic_inject_all(ioapic, state->irr);
spin_unlock(&ioapic->lock);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4e6256..084617d37c74 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -9,6 +9,7 @@ struct kvm;
struct kvm_vcpu;
#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
#define IOAPIC_EDGE_TRIG 0
#define IOAPIC_LEVEL_TRIG 1
@@ -73,7 +74,6 @@ struct kvm_ioapic {
struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
- DECLARE_BITMAP(handled_vectors, 256);
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
@@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
return kvm->arch.vioapic;
}
-static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+static inline int ioapic_in_kernel(struct kvm *kvm)
{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- smp_rmb();
- return test_bit(vector, ioapic->handled_vectors);
+ int ret;
+
+ ret = (ioapic_irqchip(kvm) != NULL);
+ return ret;
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -120,7 +121,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map);
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
- u32 *tmr);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a1ec6a50a05a..097060e33bd6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,14 +38,27 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+ return v->arch.pending_external_vector != -1;
+}
+
+/*
* check if there is a pending interrupt from a
* non-APIC source without intack.
*/
static int kvm_cpu_has_extint(struct kvm_vcpu *v)
{
- if (kvm_apic_accept_pic_intr(v))
- return pic_irqchip(v->kvm)->output; /* PIC */
- else
+ u8 accept = kvm_apic_accept_pic_intr(v);
+
+ if (accept) {
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return pic_irqchip(v->kvm)->output;
+ } else
return 0;
}
@@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
*/
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
{
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.pending;
if (kvm_cpu_has_extint(v))
return 1;
- if (kvm_apic_vid_enabled(v->kvm))
+ if (kvm_vcpu_apic_vid_enabled(v))
return 0;
return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
@@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.pending;
if (kvm_cpu_has_extint(v))
@@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
*/
static int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
- if (kvm_cpu_has_extint(v))
- return kvm_pic_read_irq(v->kvm); /* PIC */
- return -1;
+ if (kvm_cpu_has_extint(v)) {
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+ } else
+ return -1;
}
/*
@@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
int vector;
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.nr;
vector = kvm_cpu_get_extint(v);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 3d782a2c336a..ae5c78f2337d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
return kvm->arch.vpic;
}
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+ int ret;
+
+ ret = (pic_irqchip(kvm) != NULL);
+ return ret;
+}
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ return kvm->arch.irqchip_split;
+}
+
static inline int irqchip_in_kernel(struct kvm *kvm)
{
struct kvm_pic *vpic = pic_irqchip(kvm);
+ bool ret;
+
+ ret = (vpic != NULL);
+ ret |= irqchip_split(kvm);
/* Read vpic before kvm->irq_routing. */
smp_rmb();
- return vpic != NULL;
+ return ret;
+}
+
+static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ /* Same as irqchip_in_kernel(vcpu->kvm), but with less
+ * pointer chasing and no unnecessary memory barriers.
+ */
+ return vcpu->arch.apic != NULL;
}
void kvm_pic_reset(struct kvm_kpic_state *s);
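
The helpers above distinguish three configurations: full in-kernel irqchip (PIC + IOAPIC + LAPIC), split irqchip (in-kernel LAPIC only, PIC/IOAPIC in userspace), and no in-kernel irqchip. The standalone model below is illustrative only — the real predicates test the kvm->arch fields shown in the hunk — and simply prints how they evaluate in each case.

/*
 * Simplified model: a single flag stands in for vpic/vioapic being
 * present, another for the new split-irqchip mode.
 */
#include <stdbool.h>
#include <stdio.h>

struct vm { bool has_vpic_vioapic; bool irqchip_split; };

static bool pic_in_kernel(const struct vm *vm)    { return vm->has_vpic_vioapic; }
static bool ioapic_in_kernel(const struct vm *vm) { return vm->has_vpic_vioapic; }
static bool irqchip_in_kernel(const struct vm *vm)
{
        /* Full in-kernel irqchip *or* split irqchip (LAPIC only). */
        return vm->has_vpic_vioapic || vm->irqchip_split;
}

int main(void)
{
        struct vm full  = { .has_vpic_vioapic = true,  .irqchip_split = false };
        struct vm split = { .has_vpic_vioapic = false, .irqchip_split = true  };
        struct vm none  = { .has_vpic_vioapic = false, .irqchip_split = false };

        printf("full : irqchip=%d pic=%d ioapic=%d\n",
               irqchip_in_kernel(&full), pic_in_kernel(&full), ioapic_in_kernel(&full));
        printf("split: irqchip=%d pic=%d ioapic=%d\n",
               irqchip_in_kernel(&split), pic_in_kernel(&split), ioapic_in_kernel(&split));
        printf("none : irqchip=%d pic=%d ioapic=%d\n",
               irqchip_in_kernel(&none), pic_in_kernel(&none), ioapic_in_kernel(&none));
        return 0;
}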
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 9efff9e5b58c..84b96d319909 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
-static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq)
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
{
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
@@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
irq->level = 1;
irq->shorthand = 0;
}
+EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
@@ -123,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
}
-static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm)
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
{
struct kvm_lapic_irq irq;
int r;
+ if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
+ return -EWOULDBLOCK;
+
kvm_set_msi_irq(e, &irq);
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
@@ -137,42 +142,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
return -EWOULDBLOCK;
}
-/*
- * Deliver an IRQ in an atomic context if we can, or return a failure,
- * user can retry in a process context.
- * Return value:
- * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
- * Other values - No need to retry.
- */
-int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
-{
- struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
- struct kvm_kernel_irq_routing_entry *e;
- int ret = -EINVAL;
- int idx;
-
- trace_kvm_set_irq(irq, level, irq_source_id);
-
- /*
- * Injection into either PIC or IOAPIC might need to scan all CPUs,
- * which would need to be retried from thread context; when same GSI
- * is connected to both PIC and IOAPIC, we'd have to report a
- * partial failure here.
- * Since there's no easy way to do this, we only support injecting MSI
- * which is limited to 1:1 GSI mapping.
- */
- idx = srcu_read_lock(&kvm->irq_srcu);
- if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
- e = &entries[0];
- if (likely(e->type == KVM_IRQ_ROUTING_MSI))
- ret = kvm_set_msi_inatomic(e, kvm);
- else
- ret = -EWOULDBLOCK;
- }
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
-
int kvm_request_irq_source_id(struct kvm *kvm)
{
unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@@ -208,7 +177,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!irqchip_in_kernel(kvm))
+ if (!ioapic_in_kernel(kvm))
goto unlock;
kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
@@ -297,6 +266,33 @@ out:
return r;
}
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int i, r = 0;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
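
kvm_intr_is_single_vcpu() above walks the vCPUs, gives up as soon as a second destination matches, and succeeds only when exactly one did. A trivial standalone illustration of that counting pattern (the match predicate is a stand-in for kvm_apic_match_dest(); not KVM code):

#include <stdbool.h>
#include <stdio.h>

static bool exactly_one_match(const int *ids, int n, int wanted, int *out)
{
        int i, matches = 0;

        for (i = 0; i < n; i++) {
                if (ids[i] != wanted)
                        continue;
                if (++matches == 2)
                        return false;   /* more than one target: bail out early */
                *out = i;
        }
        return matches == 1;
}

int main(void)
{
        int apic_ids[] = { 3, 7, 7, 9 };
        int idx;

        printf("wanted 9: %d\n", exactly_one_match(apic_ids, 4, 9, &idx)); /* 1 */
        printf("wanted 7: %d\n", exactly_one_match(apic_ids, 4, 7, &idx)); /* 0 */
        return 0;
}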
+
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@@ -328,3 +324,54 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
return kvm_set_irq_routing(kvm, default_routing,
ARRAY_SIZE(default_routing), 0);
}
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+ if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ /* kvm->irq_routing must be read after clearing
+ * KVM_SCAN_IOAPIC. */
+ smp_mb();
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ u32 dest_id, dest_mode;
+ bool level;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ dest_id = (entry->msi.address_lo >> 12) & 0xff;
+ dest_mode = (entry->msi.address_lo >> 2) & 0x1;
+ level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
+ if (level && kvm_apic_match_dest(vcpu, NULL, 0,
+ dest_id, dest_mode)) {
+ u32 vector = entry->msi.data & 0xff;
+
+ __set_bit(vector,
+ (unsigned long *) eoi_exit_bitmap);
+ }
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
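
kvm_scan_ioapic_routes() above pulls the destination ID, destination mode, vector and trigger mode straight out of the MSI address/data words of each routing entry. A userspace sketch of that decoding (field positions follow the standard x86 MSI layout; the example values are made up and this is not a KVM interface):

#include <stdint.h>
#include <stdio.h>

#define MSI_DATA_TRIGGER_LEVEL  (1u << 15)

int main(void)
{
        /* Example: level-triggered MSI to APIC ID 0x05, vector 0x41. */
        uint32_t address_lo = 0xfee05000u;      /* 0xFEE00000 | dest_id << 12 */
        uint32_t data       = MSI_DATA_TRIGGER_LEVEL | 0x41;

        uint32_t dest_id   = (address_lo >> 12) & 0xff; /* bits 19:12        */
        uint32_t dest_mode = (address_lo >> 2) & 0x1;   /* 0=physical, 1=logical */
        uint32_t vector    = data & 0xff;
        int level          = !!(data & MSI_DATA_TRIGGER_LEVEL);

        printf("dest_id=%#x dest_mode=%u vector=%#x level=%d\n",
               dest_id, dest_mode, vector, level);
        return 0;
}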
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8d9013c5e1ee..4d30b865be30 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -209,7 +209,7 @@ out:
if (old)
kfree_rcu(old, rcu);
- kvm_vcpu_request_scan_ioapic(kvm);
+ kvm_make_scan_ioapic_request(kvm);
}
static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
struct kvm_lapic *apic = vcpu->arch.apic;
__kvm_apic_update_irr(pir, apic->regs);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
@@ -390,7 +392,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu;
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) {
+ if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
/* try to update RVI */
apic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -551,15 +553,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int i;
-
- for (i = 0; i < 8; i++)
- apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
-}
-
static void apic_update_ppr(struct kvm_lapic *apic)
{
u32 tpr, isrv, ppr, old_ppr;
@@ -764,6 +757,65 @@ out:
return ret;
}
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ struct kvm_apic_map *map;
+ bool ret = false;
+ struct kvm_lapic *dst = NULL;
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (!map)
+ goto out;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id == 0xFF)
+ goto out;
+
+ if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
+ goto out;
+
+ dst = map->phys_map[irq->dest_id];
+ if (dst && kvm_apic_present(dst->vcpu))
+ *dest_vcpu = dst->vcpu;
+ else
+ goto out;
+ } else {
+ u16 cid;
+ unsigned long bitmap = 1;
+ int i, r = 0;
+
+ if (!kvm_apic_logical_map_valid(map))
+ goto out;
+
+ apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+
+ if (cid >= ARRAY_SIZE(map->logical_map))
+ goto out;
+
+ for_each_set_bit(i, &bitmap, 16) {
+ dst = map->logical_map[cid][i];
+ if (++r == 2)
+ goto out;
+ }
+
+ if (dst && kvm_apic_present(dst->vcpu))
+ *dest_vcpu = dst->vcpu;
+ else
+ goto out;
+ }
+
+ ret = true;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
/*
* Add a pending IRQ into lapic.
* Return 1 if successfully added and 0 if discarded.
@@ -781,6 +833,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
case APIC_DM_FIXED:
+ if (unlikely(trig_mode && !level))
+ break;
+
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
@@ -790,6 +845,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map);
+ if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
+ if (trig_mode)
+ apic_set_vector(vector, apic->regs + APIC_TMR);
+ else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
+ }
+
if (kvm_x86_ops->deliver_posted_interrupt)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else {
@@ -868,16 +930,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
+{
+ return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap);
+}
+
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
- int trigger_mode;
- if (apic_test_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
- else
- trigger_mode = IOAPIC_EDGE_TRIG;
- kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+ int trigger_mode;
+
+ /* Nothing to do if no ioapic (in-kernel or userspace) handles this vector. */
+ if (!kvm_ioapic_handles_vector(apic, vector))
+ return;
+
+ /* Request a KVM exit to inform the userspace IOAPIC. */
+ if (irqchip_split(apic->vcpu->kvm)) {
+ apic->vcpu->arch.pending_ioapic_eoi = vector;
+ kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+ return;
}
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+
+ kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -1172,7 +1250,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
/* __delay is delay_tsc whenever the hardware has TSC, thus always. */
@@ -1240,7 +1318,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
local_irq_save(flags);
now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
@@ -1615,7 +1693,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+ apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
apic->highest_isr_cache = -1;
update_divide_count(apic);
@@ -1838,7 +1916,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_rtc_eoi_tracking_restore_one(vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_rtc_eoi_tracking_restore_one(vcpu);
+
+ vcpu->arch.apic_arb_prio = 0;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1922,7 +2003,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
/* Cache not set: could be safe but we don't bother. */
apic->highest_isr_cache == -1 ||
/* Need EOI to update ioapic. */
- kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
+ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
/*
* PV EOI was disabled by apic_sync_pv_eoi_from_guest
* so we need not do anything here.
@@ -1978,7 +2059,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4;
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_ICR2)
@@ -1995,7 +2076,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_DFR || reg == APIC_ICR2) {
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 764037991d26..fde8e35d5850 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
-void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
@@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
}
-static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops->vm_has_apicv(kvm);
+ return kvm_x86_ops->cpu_uses_apicv(vcpu);
}
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -169,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff606f507913..e7c2c1428a69 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm->arch.indirect_shadow_pages--;
}
-static int has_wrprotected_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- int level)
+static int __has_wrprotected_page(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
{
- struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
return linfo->write_count;
@@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu,
return 1;
}
+static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return __has_wrprotected_page(gfn, level, slot);
+}
+
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
unsigned long page_size;
@@ -851,6 +856,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
+static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
+ bool no_dirty_log)
+{
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+ if (no_dirty_log && slot->dirty_bitmap)
+ return false;
+
+ return true;
+}
+
static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
@@ -858,21 +874,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
- (no_dirty_log && slot->dirty_bitmap))
+ if (!memslot_valid_for_gpte(slot, no_dirty_log))
slot = NULL;
return slot;
}
-static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
- return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
-}
-
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
+ bool *force_pt_level)
{
int host_level, level, max_level;
+ struct kvm_memory_slot *slot;
+
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
+ *force_pt_level = !memslot_valid_for_gpte(slot, true);
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@@ -882,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (has_wrprotected_page(vcpu, large_gfn, level))
+ if (__has_wrprotected_page(large_gfn, level, slot))
break;
return level - 1;
@@ -2962,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
- int force_pt_level;
+ bool force_pt_level = false;
pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
/*
* This path builds a PAE pagetable - so we can map
* 2mb pages at maximum. Therefore check if the level
@@ -2979,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
+ }
if (fast_page_fault(vcpu, v, level, error_code))
return 0;
@@ -3341,7 +3359,7 @@ exit:
return reserved;
}
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
@@ -3350,7 +3368,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return RET_MMIO_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
- if (unlikely(reserved))
+ if (WARN_ON(reserved))
return RET_MMIO_PF_BUG;
if (is_mmio_spte(spte)) {
@@ -3374,17 +3392,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
*/
return RET_MMIO_PF_RETRY;
}
-EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
-
-static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
- u32 error_code, bool direct)
-{
- int ret;
-
- ret = handle_mmio_page_fault_common(vcpu, addr, direct);
- WARN_ON(ret == RET_MMIO_PF_BUG);
- return ret;
-}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
@@ -3395,7 +3403,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+ r = handle_mmio_page_fault(vcpu, gva, true);
if (likely(r != RET_MMIO_PF_INVALID))
return r;
@@ -3427,7 +3435,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
- if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+ if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
@@ -3476,7 +3484,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
pfn_t pfn;
int r;
int level;
- int force_pt_level;
+ bool force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
@@ -3485,7 +3493,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+ r = handle_mmio_page_fault(vcpu, gpa, true);
if (likely(r != RET_MMIO_PF_INVALID))
return r;
@@ -3495,20 +3503,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
- if (mapping_level_dirty_bitmap(vcpu, gfn) ||
- !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
- force_pt_level = 1;
- else
- force_pt_level = 0;
-
+ force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
+ PT_DIRECTORY_LEVEL);
+ level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
if (level > PT_DIRECTORY_LEVEL &&
!check_hugepage_cache_consistency(vcpu, gfn, level))
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
+ }
if (fast_page_fault(vcpu, gpa, level, error_code))
return 0;
@@ -3706,7 +3709,7 @@ static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
int maxphyaddr, bool execonly)
{
- int pte;
+ u64 bad_mt_xwr;
rsvd_check->rsvd_bits_mask[0][3] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
@@ -3724,14 +3727,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
- for (pte = 0; pte < 64; pte++) {
- int rwx_bits = pte & 7;
- int mt = pte >> 3;
- if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
- rwx_bits == 0x2 || rwx_bits == 0x6 ||
- (rwx_bits == 0x4 && !execonly))
- rsvd_check->bad_mt_xwr |= (1ull << pte);
+ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
+ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
+ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
+ if (!execonly) {
+ /* bits 0..2 must not be 100 unless VMX capabilities allow it */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
}
+ rsvd_check->bad_mt_xwr = bad_mt_xwr;
}
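
The table-driven bad_mt_xwr construction above is intended to produce exactly the same mask as the per-PTE loop it deletes. The quick standalone check below (not part of the patch; REPEAT_BYTE is redefined locally to match the kernel macro for 64-bit longs) confirms the two agree for both execonly settings:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define REPEAT_BYTE(x)  ((~0ull / 0xff) * (x))

static uint64_t bad_mt_xwr_loop(int execonly)
{
        uint64_t mask = 0;
        int pte;

        for (pte = 0; pte < 64; pte++) {
                int rwx_bits = pte & 7;
                int mt = pte >> 3;
                if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
                    rwx_bits == 0x2 || rwx_bits == 0x6 ||
                    (rwx_bits == 0x4 && !execonly))
                        mask |= 1ull << pte;
        }
        return mask;
}

static uint64_t bad_mt_xwr_table(int execonly)
{
        uint64_t mask;

        mask  = 0xFFull << (2 * 8);             /* memory type 2 is bad */
        mask |= 0xFFull << (3 * 8);             /* memory type 3 is bad */
        mask |= 0xFFull << (7 * 8);             /* memory type 7 is bad */
        mask |= REPEAT_BYTE(1ull << 2);         /* XWR == 010 is bad    */
        mask |= REPEAT_BYTE(1ull << 6);         /* XWR == 110 is bad    */
        if (!execonly)
                mask |= REPEAT_BYTE(1ull << 4); /* XWR == 100 is bad    */
        return mask;
}

int main(void)
{
        assert(bad_mt_xwr_loop(0) == bad_mt_xwr_table(0));
        assert(bad_mt_xwr_loop(1) == bad_mt_xwr_table(1));
        printf("execonly=0: %#llx\nexeconly=1: %#llx\n",
               (unsigned long long)bad_mt_xwr_table(0),
               (unsigned long long)bad_mt_xwr_table(1));
        return 0;
}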
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e4202e41d535..55ffb7b0f95e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -56,13 +56,13 @@ void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
/*
- * Return values of handle_mmio_page_fault_common:
+ * Return values of handle_mmio_page_fault:
* RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
* directly.
* RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
* fault path update the mmio spte.
* RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: bug is detected.
+ * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
*/
enum {
RET_MMIO_PF_EMULATE = 1,
@@ -71,7 +71,7 @@ enum {
RET_MMIO_PF_BUG = -1
};
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
+int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 736e6ab8784d..3058a22a658d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -698,15 +698,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- int force_pt_level;
+ bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, addr, error_code,
- mmu_is_nested(vcpu));
+ r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
if (likely(r != RET_MMIO_PF_INVALID))
return r;
@@ -743,15 +742,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
- if (walker.level >= PT_DIRECTORY_LEVEL)
- force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
- || is_self_change_mapping;
- else
- force_pt_level = 1;
- if (!force_pt_level) {
- level = min(walker.level, mapping_level(vcpu, walker.gfn));
- walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
- }
+ if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
+ level = mapping_level(vcpu, walker.gfn, &force_pt_level);
+ if (likely(!force_pt_level)) {
+ level = min(walker.level, level);
+ walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+ } else
+ force_pt_level = true;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2f9ed1ff0632..83a1c643f9a5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -158,7 +158,8 @@ struct vcpu_svm {
unsigned long int3_rip;
u32 apf_reason;
- u64 tsc_ratio;
+ /* cached guest cpuid flags for faster access */
+ bool nrips_enabled : 1;
};
static DEFINE_PER_CPU(u64, current_tsc_ratio);
@@ -211,7 +212,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
-static u64 __scale_tsc(u64 ratio, u64 tsc);
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -891,20 +891,9 @@ static __init int svm_hardware_setup(void)
kvm_enable_efer_bits(EFER_FFXSR);
if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- u64 max;
-
kvm_has_tsc_control = true;
-
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated needed because it will always
- * be 1 on all machines and a value of 0 is used to disable
- * tsc-scaling for the vcpu.
- */
- max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
-
- kvm_max_guest_tsc_khz = max;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
}
if (nested) {
@@ -968,68 +957,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static u64 __scale_tsc(u64 ratio, u64 tsc)
-{
- u64 mult, frac, _tsc;
-
- mult = ratio >> 32;
- frac = ratio & ((1ULL << 32) - 1);
-
- _tsc = tsc;
- _tsc *= mult;
- _tsc += (tsc >> 32) * frac;
- _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
-
- return _tsc;
-}
-
-static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 _tsc = tsc;
-
- if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
- _tsc = __scale_tsc(svm->tsc_ratio, tsc);
-
- return _tsc;
-}
-
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 ratio;
- u64 khz;
-
- /* Guest TSC same frequency as host TSC? */
- if (!scale) {
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
- return;
- }
-
- /* TSC scaling supported? */
- if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
- return;
- }
-
- khz = user_tsc_khz;
-
- /* TSC scaling required - calculate ratio */
- ratio = khz << 32;
- do_div(ratio, tsc_khz);
-
- if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
- WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
- user_tsc_khz);
- return;
- }
- svm->tsc_ratio = ratio;
-}
-
static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1056,16 +983,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
+static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (host) {
- if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
- WARN_ON(adjustment < 0);
- adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
- }
-
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1077,16 +998,7 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- u64 tsc;
-
- tsc = svm_scale_tsc(vcpu, rdtsc());
-
- return target_tsc - tsc;
-}
-
-static void init_vmcb(struct vcpu_svm *svm, bool init_event)
+static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1107,6 +1019,8 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
set_exception_intercept(svm, PF_VECTOR);
set_exception_intercept(svm, UD_VECTOR);
set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
set_intercept(svm, INTERCEPT_INTR);
set_intercept(svm, INTERCEPT_NMI);
@@ -1157,8 +1071,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- if (!init_event)
- svm_set_efer(&svm->vcpu, 0);
+ svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
@@ -1212,7 +1125,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
}
- init_vmcb(svm, init_event);
+ init_vmcb(svm);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1233,8 +1146,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto out;
}
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
-
err = kvm_vcpu_init(&svm->vcpu, kvm, id);
if (err)
goto free_svm;
@@ -1268,7 +1179,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
- init_vmcb(svm, false);
+ init_vmcb(svm);
svm_init_osvw(&svm->vcpu);
@@ -1320,10 +1231,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
- svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
- __this_cpu_write(current_tsc_ratio, svm->tsc_ratio);
- wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
+ if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
+ __this_cpu_write(current_tsc_ratio, tsc_ratio);
+ wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
+ }
}
}
@@ -1642,20 +1555,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
mark_dirty(svm->vmcb, VMCB_SEG);
}
-static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
+static void update_bp_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- clr_exception_intercept(svm, DB_VECTOR);
clr_exception_intercept(svm, BP_VECTOR);
- if (svm->nmi_singlestep)
- set_exception_intercept(svm, DB_VECTOR);
-
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
} else
@@ -1761,7 +1667,6 @@ static int db_interception(struct vcpu_svm *svm)
if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
svm->vmcb->save.rflags &=
~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_bp_intercept(&svm->vcpu);
}
if (svm->vcpu.guest_debug &
@@ -1796,6 +1701,12 @@ static int ud_interception(struct vcpu_svm *svm)
return 1;
}
+static int ac_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1890,7 +1801,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm, false);
+ init_vmcb(svm);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -2365,7 +2276,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
- nested_vmcb->control.next_rip = vmcb->control.next_rip;
+
+ if (svm->nrips_enabled)
+ nested_vmcb->control.next_rip = vmcb->control.next_rip;
/*
* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -3060,7 +2973,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
r = cr_interception(svm);
- if (irqchip_in_kernel(svm->vcpu.kvm))
+ if (lapic_in_kernel(&svm->vcpu))
return r;
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
return r;
@@ -3071,8 +2984,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
- return vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, host_tsc);
+ return vmcb->control.tsc_offset + host_tsc;
}
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3082,7 +2994,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_IA32_TSC: {
msr_info->data = svm->vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, rdtsc());
+ kvm_scale_tsc(vcpu, rdtsc());
break;
}
@@ -3294,24 +3206,11 @@ static int msr_interception(struct vcpu_svm *svm)
static int interrupt_window_interception(struct vcpu_svm *svm)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
++svm->vcpu.stat.irq_window_exits;
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(svm->vcpu.kvm) &&
- kvm_run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(&svm->vcpu)) {
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
-
return 1;
}
@@ -3371,6 +3270,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
@@ -3659,12 +3559,12 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
-static int svm_vm_has_apicv(struct kvm *kvm)
+static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
{
return 0;
}
-static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
return;
}
@@ -3754,7 +3654,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
*/
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_bp_intercept(vcpu);
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4098,6 +3997,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Update nrips enabled cache */
+ svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
}
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -4376,7 +4279,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
- .update_db_bp_intercept = update_db_bp_intercept,
+ .update_bp_intercept = update_bp_intercept,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,
@@ -4425,7 +4328,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
- .vm_has_apicv = svm_vm_has_apicv,
+ .cpu_uses_apicv = svm_cpu_uses_apicv,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.sync_pir_to_irr = svm_sync_pir_to_irr,
@@ -4448,11 +4351,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
- .set_tsc_khz = svm_set_tsc_khz,
.read_tsc_offset = svm_read_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
- .adjust_tsc_offset = svm_adjust_tsc_offset,
- .compute_tsc_offset = svm_compute_tsc_offset,
+ .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
.read_l1_tsc = svm_read_l1_tsc,
.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4eae7c35ddf5..120302511802 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -129,6 +129,24 @@ TRACE_EVENT(kvm_pio,
);
/*
+ * Tracepoint for fast mmio.
+ */
+TRACE_EVENT(kvm_fast_mmio,
+ TP_PROTO(u64 gpa),
+ TP_ARGS(gpa),
+
+ TP_STRUCT__entry(
+ __field(u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = gpa;
+ ),
+
+ TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
+);
+
+/*
* Tracepoint for cpuid.
*/
TRACE_EVENT(kvm_cpuid,
@@ -974,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm,
__entry->smbase)
);
+/*
+ * Tracepoint for VT-d posted-interrupts.
+ */
+TRACE_EVENT(kvm_pi_irte_update,
+ TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
+ unsigned int gvec, u64 pi_desc_addr, bool set),
+ TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, gsi )
+ __field( unsigned int, gvec )
+ __field( u64, pi_desc_addr )
+ __field( bool, set )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->gsi = gsi;
+ __entry->gvec = gvec;
+ __entry->pi_desc_addr = pi_desc_addr;
+ __entry->set = set;
+ ),
+
+ TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
+ "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ __entry->set ? "enabled and being updated" : "disabled",
+ __entry->vcpu_id,
+ __entry->gsi,
+ __entry->gvec,
+ __entry->pi_desc_addr)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a8bc64566ab..af823a388c19 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -35,6 +35,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
+#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
@@ -45,6 +46,7 @@
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
+#include <asm/irq_remapping.h>
#include "trace.h"
#include "pmu.h"
@@ -105,6 +107,8 @@ static u64 __read_mostly host_xss;
static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
+#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -424,6 +428,9 @@ struct nested_vmx {
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+ u16 vpid02;
+ u16 last_vpid;
+
u32 nested_vmx_procbased_ctls_low;
u32 nested_vmx_procbased_ctls_high;
u32 nested_vmx_true_procbased_ctls_low;
@@ -440,14 +447,33 @@ struct nested_vmx {
u32 nested_vmx_misc_low;
u32 nested_vmx_misc_high;
u32 nested_vmx_ept_caps;
+ u32 nested_vmx_vpid_caps;
};
#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
- u32 control; /* bit 0 of control is outstanding notification bit */
- u32 rsvd[7];
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
} __aligned(64);
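
The pi_desc union above overlays named bitfields on the 64-bit control word so that set_bit()/test_bit() with POSTED_INTR_ON (bit 0) and POSTED_INTR_SN (bit 1) keep working on the same memory. The sketch below replicates the layout to show that correspondence; like the kernel struct it relies on the usual little-endian GCC bitfield layout (non-int bitfield types are a compiler extension), and it is a model, not the KVM structure itself.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define POSTED_INTR_ON  0
#define POSTED_INTR_SN  1

union pi_control {
        struct {
                uint16_t on : 1;        /* Outstanding Notification */
                uint16_t sn : 1;        /* Suppress Notification    */
                uint16_t rsvd_1 : 14;
                uint8_t  nv;            /* Notification Vector      */
                uint8_t  rsvd_2;
                uint32_t ndst;          /* Notification Destination */
        };
        uint64_t control;
};

int main(void)
{
        union pi_control pi = { .control = 0 };

        pi.sn = 1;              /* suppress notifications              */
        pi.nv = 0xf2;           /* an example notification vector      */
        pi.ndst = 3 << 8;       /* xAPIC: destination APIC ID in 15:8  */

        assert(pi.control & (1ull << POSTED_INTR_SN));
        assert(!(pi.control & (1ull << POSTED_INTR_ON)));
        printf("control = %#llx\n", (unsigned long long)pi.control);
        return 0;
}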
static bool pi_test_and_set_on(struct pi_desc *pi_desc)
@@ -467,6 +493,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ return clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ return set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -532,8 +582,6 @@ struct vcpu_vmx {
s64 vnmi_blocked_time;
u32 exit_reason;
- bool rdtscp_enabled;
-
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -563,6 +611,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
@@ -809,7 +862,7 @@ static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
-static int vmx_vm_has_apicv(struct kvm *kvm);
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -831,6 +884,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
+/*
+ * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
+ * can find which vCPU should be woken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
@@ -946,9 +1006,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
-static inline bool vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
{
- return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+ return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -983,7 +1043,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
static inline bool cpu_has_vmx_posted_intr(void)
{
- return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+ return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
+ vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}
static inline bool cpu_has_vmx_apicv(void)
@@ -1062,9 +1123,9 @@ static inline bool cpu_has_vmx_ple(void)
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
- return flexpriority_enabled && irqchip_in_kernel(kvm);
+ return flexpriority_enabled && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_vmx_vpid(void)
@@ -1113,6 +1174,12 @@ static inline bool cpu_has_vmx_pml(void)
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_TSC_SCALING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -1157,6 +1224,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
}
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
@@ -1337,13 +1409,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
__loaded_vmcs_clear, loaded_vmcs, 1);
}
-static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(int vpid)
{
- if (vmx->vpid == 0)
+ if (vpid == 0)
return;
if (cpu_has_vmx_invvpid_single())
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
}
static inline void vpid_sync_vcpu_global(void)
@@ -1352,10 +1424,10 @@ static inline void vpid_sync_vcpu_global(void)
__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}
-static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+static inline void vpid_sync_context(int vpid)
{
if (cpu_has_vmx_invvpid_single())
- vpid_sync_vcpu_single(vmx);
+ vpid_sync_vcpu_single(vpid);
else
vpid_sync_vcpu_global();
}
@@ -1567,7 +1639,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
- (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1895,6 +1967,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
preempt_enable();
}
+static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
+ * are two possible cases:
+ * 1. After running 'pre_block', context switch
+ * happened. For this case, 'sn' was set in
+ * vmx_vcpu_put(), so we need to clear it here.
+ * 2. After running 'pre_block', we were blocked and
+ * later woken up by another task. In that case
+ * 'pi_post_block' will do everything for us.
+ * However, we cannot tell case #1 from case #2
+ * here (and may not need to), so we clear 'sn'
+ * in both cases; the extra clear is harmless.
+ */
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
+ if (vcpu->cpu != cpu) {
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+ }
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ }
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+}
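
vmx_vcpu_pi_load() above updates the posted-interrupt control word with a lock-free retry loop: snapshot the word, build the desired new value, and compare-and-swap until no concurrent writer got in between. A minimal standalone illustration of that pattern, using C11 atomics where the kernel uses cmpxchg() and a simplified field layout (all names illustrative):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PI_SN           (1ull << 1)
#define PI_NV_SHIFT     16

static _Atomic uint64_t pi_control;

static void pi_update(uint8_t notification_vector)
{
        uint64_t old, new;

        old = atomic_load(&pi_control);
        do {
                new = old;
                new &= ~PI_SN;                          /* allow posting again */
                new &= ~(0xffull << PI_NV_SHIFT);       /* replace the vector  */
                new |= (uint64_t)notification_vector << PI_NV_SHIFT;
                /* On failure, 'old' is refreshed and the loop retries. */
        } while (!atomic_compare_exchange_weak(&pi_control, &old, new));
}

int main(void)
{
        atomic_store(&pi_control, PI_SN | (0xf3ull << PI_NV_SHIFT));
        pi_update(0xf2);
        printf("control = %#llx\n",
               (unsigned long long)atomic_load(&pi_control));  /* 0xf20000 */
        return 0;
}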
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -1943,12 +2061,35 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+ /* Setup TSC multiplier */
+ if (cpu_has_vmx_tsc_scaling())
+ vmcs_write64(TSC_MULTIPLIER,
+ vcpu->arch.tsc_scaling_ratio);
+
vmx->loaded_vmcs->cpu = cpu;
}
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+}
+
+static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
+ vmx_vcpu_pi_put(vcpu);
+
__vmx_load_host_state(to_vmx(vcpu));
if (!vmm_exclusive) {
__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
@@ -2207,7 +2348,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && vmx->rdtscp_enabled)
+ if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_STAR is only needed on long mode guests, and only
@@ -2230,15 +2371,16 @@ static void setup_msrs(struct vcpu_vmx *vmx)
/*
* reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset -- 21.3
+ * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
+ * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
*/
-static u64 guest_read_tsc(void)
+static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
{
u64 host_tsc, tsc_offset;
host_tsc = rdtsc();
tsc_offset = vmcs_read64(TSC_OFFSET);
- return host_tsc + tsc_offset;
+ return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
}
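As a rough numeric illustration of the formula in the comment above — a standalone sketch, assuming the 48-bit fractional multiplier configured later in hardware_setup(); scale_tsc() below is an illustrative stand-in, not the kernel's mul_u64_u64_shr():

        #include <stdint.h>
        #include <stdio.h>

        #define TSC_RATIO_FRAC_BITS 48  /* mirrors kvm_tsc_scaling_ratio_frac_bits */

        /* 64x64-bit multiply followed by a 48-bit right shift. */
        static uint64_t scale_tsc(uint64_t tsc, uint64_t ratio)
        {
                return (uint64_t)(((unsigned __int128)tsc * ratio) >> TSC_RATIO_FRAC_BITS);
        }

        int main(void)
        {
                uint64_t ratio = 3ULL << (TSC_RATIO_FRAC_BITS - 1); /* 1.5 * 2^48 */
                uint64_t host_tsc = 1000000;
                int64_t tsc_offset = -250000;

                /* guest_tsc = (host_tsc * multiplier) >> 48 + offset = 1500000 - 250000 */
                printf("guest_tsc = %llu\n",
                       (unsigned long long)(scale_tsc(host_tsc, ratio) + tsc_offset));
                return 0;
        }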
/*
@@ -2255,22 +2397,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
return host_tsc + tsc_offset;
}
-/*
- * Engage any workarounds for mis-matched TSC rates. Currently limited to
- * software catchup for faster rates on slower CPUs.
- */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- if (!scale)
- return;
-
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
-}
-
static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
{
return vmcs_read64(TSC_OFFSET);
@@ -2302,7 +2428,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
}
}
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
+static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
{
u64 offset = vmcs_read64(TSC_OFFSET);
@@ -2315,11 +2441,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
offset + adjustment);
}
-static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- return target_tsc - rdtsc();
-}
-
static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
@@ -2377,7 +2498,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (vmx_cpu_uses_apicv(&vmx->vcpu))
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_POSTED_INTR;
@@ -2471,10 +2592,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_PCOMMIT;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
@@ -2493,6 +2616,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ if (enable_vpid)
+ vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+ else
+ vmx->nested.nested_vmx_vpid_caps = 0;
+
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -2608,7 +2737,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
/* Currently, no nested vpid support */
- *pdata = vmx->nested.nested_vmx_ept_caps;
+ *pdata = vmx->nested.nested_vmx_ept_caps |
+ ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
break;
default:
return 1;
@@ -2642,7 +2772,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSC:
- msr_info->data = guest_read_tsc();
+ msr_info->data = guest_read_tsc(vcpu);
break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
@@ -2673,7 +2803,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
- if (!to_vmx(vcpu)->rdtscp_enabled)
+ if (!guest_cpuid_has_rdtscp(vcpu))
return 1;
/* Otherwise falls through */
default:
@@ -2779,7 +2909,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
- if (!vmx->rdtscp_enabled)
+ if (!guest_cpuid_has_rdtscp(vcpu))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
@@ -2874,6 +3004,8 @@ static int hardware_enable(void)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
/*
* Now we can enable the vmclear operation in kdump
@@ -3015,7 +3147,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_ENABLE_PML;
+ SECONDARY_EXEC_ENABLE_PML |
+ SECONDARY_EXEC_PCOMMIT |
+ SECONDARY_EXEC_TSC_SCALING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -3441,9 +3575,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
{
- vpid_sync_context(to_vmx(vcpu));
+ vpid_sync_context(vpid);
if (enable_ept) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
@@ -3451,6 +3585,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
}
}
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
+}
+
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -3644,20 +3783,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
- /*
- * SMEP/SMAP is disabled if CPU is in non-paging mode
- * in hardware. However KVM always uses paging mode to
- * emulate guest non-paging mode with TDP.
- * To emulate this behavior, SMEP/SMAP needs to be
- * manually disabled when guest switches to non-paging
- * mode.
- */
- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
} else if (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
}
+ if (!enable_unrestricted_guest && !is_paging(vcpu))
+ /*
+ * SMEP/SMAP is disabled if the CPU is in non-paging mode in
+ * hardware. However, without unrestricted guest support, KVM
+ * always runs the guest in paging mode.
+ * To emulate this behavior, SMEP/SMAP needs to be manually
+ * disabled when the guest switches to non-paging mode.
+ */
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
return 0;
@@ -4146,29 +4286,28 @@ static int alloc_identity_pagetable(struct kvm *kvm)
return r;
}
-static void allocate_vpid(struct vcpu_vmx *vmx)
+static int allocate_vpid(void)
{
int vpid;
- vmx->vpid = 0;
if (!enable_vpid)
- return;
+ return 0;
spin_lock(&vmx_vpid_lock);
vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
- if (vpid < VMX_NR_VPIDS) {
- vmx->vpid = vpid;
+ if (vpid < VMX_NR_VPIDS)
__set_bit(vpid, vmx_vpid_bitmap);
- }
+ else
+ vpid = 0;
spin_unlock(&vmx_vpid_lock);
+ return vpid;
}
-static void free_vpid(struct vcpu_vmx *vmx)
+static void free_vpid(int vpid)
{
- if (!enable_vpid)
+ if (!enable_vpid || vpid == 0)
return;
spin_lock(&vmx_vpid_lock);
- if (vmx->vpid != 0)
- __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+ __clear_bit(vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
}
@@ -4323,9 +4462,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
msr, MSR_TYPE_W);
}
-static int vmx_vm_has_apicv(struct kvm *kvm)
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
{
- return enable_apicv && irqchip_in_kernel(kvm);
+ return enable_apicv && lapic_in_kernel(vcpu);
}
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4369,6 +4508,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_SMP
if (vcpu->mode == IN_GUEST_MODE) {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Urgent interrupts are not currently supported; all
+ * interrupts are treated as non-urgent, so interrupts
+ * cannot be posted while 'SN' is set.
+ *
+ * If the vcpu is in guest mode, it is running rather
+ * than scheduled out and waiting on a run queue, and
+ * that is currently the only case in which 'SN' can be
+ * set, so warn if 'SN' is set here.
+ */
+ WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
+
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
POSTED_INTR_VECTOR);
return true;
@@ -4505,7 +4660,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
- if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (!vmx_cpu_uses_apicv(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
return pin_based_exec_ctrl;
}
@@ -4517,7 +4672,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+ if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
exec_control |= CPU_BASED_CR8_STORE_EXITING |
@@ -4534,7 +4689,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -4548,7 +4703,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (!ple_gap)
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (!vmx_cpu_uses_apicv(&vmx->vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4558,8 +4713,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
a current VMCS12
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- /* PML is enabled/disabled in creating/destorying vcpu */
- exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ if (!enable_pml)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* Currently, the L1 guest is allowed to execute the pcommit instruction directly. */
+ exec_control &= ~SECONDARY_EXEC_PCOMMIT;
return exec_control;
}
@@ -4604,12 +4763,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
+ if (cpu_has_secondary_exec_ctrls())
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
vmx_secondary_exec_control(vmx));
- }
- if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
+ if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4753,7 +4911,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (cpu_has_vmx_tpr_shadow() && !init_event) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vcpu->kvm))
+ if (cpu_need_tpr_shadow(vcpu))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
__pa(vcpu->arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
@@ -4761,7 +4919,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx_vm_has_apicv(vcpu->kvm))
+ if (vmx_cpu_uses_apicv(vcpu))
memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
if (vmx->vpid != 0)
@@ -4771,12 +4929,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx->vcpu.arch.cr0 = cr0;
vmx_set_cr4(vcpu, 0);
- if (!init_event)
- vmx_set_efer(vcpu, 0);
+ vmx_set_efer(vcpu, 0);
vmx_fpu_activate(vcpu);
update_exception_bitmap(vcpu);
- vpid_sync_context(vmx);
+ vpid_sync_context(vmx->vpid);
}
/*
@@ -5104,6 +5261,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
+ case AC_VECTOR:
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
if (!(vcpu->guest_debug &
@@ -5296,7 +5456,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
kvm_complete_insn_gp(vcpu, err);
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return 1;
if (cr8_prev <= cr8)
return 1;
@@ -5510,17 +5670,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
++vcpu->stat.irq_window_exits;
-
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
return 1;
}
@@ -5753,10 +5902,11 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
skip_emulated_instruction(vcpu);
+ trace_kvm_fast_mmio(gpa);
return 1;
}
- ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+ ret = handle_mmio_page_fault(vcpu, gpa, true);
if (likely(ret == RET_MMIO_PF_EMULATE))
return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
EMULATE_DONE;
@@ -5910,6 +6060,25 @@ static void update_ple_window_actual_max(void)
ple_window_grow, INT_MIN);
}
+/*
+ * Handler for POSTED_INTR_WAKEUP_VECTOR.
+ */
+static void wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
static __init int hardware_setup(void)
{
int r = -ENOMEM, i, msr;
@@ -6028,6 +6197,12 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (cpu_has_vmx_tsc_scaling()) {
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 48;
+ }
+
if (enable_apicv)
kvm_x86_ops->update_cr8_intercept = NULL;
else {
@@ -6096,6 +6271,8 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+
return alloc_kvm_area();
out8:
@@ -6627,7 +6804,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
- u32 exec_control;
if (vmx->nested.current_vmptr == -1ull)
return;
@@ -6640,9 +6816,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
vmx->nested.posted_intr_nv = -1;
@@ -6662,6 +6837,7 @@ static void free_nested(struct vcpu_vmx *vmx)
return;
vmx->nested.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
@@ -7038,7 +7214,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
gpa_t vmptr;
- u32 exec_control;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -7070,9 +7245,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
if (enable_shadow_vmcs) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER,
__pa(vmx->nested.current_shadow_vmcs));
vmx->nested.sync_shadow_vmcs = true;
@@ -7178,7 +7352,58 @@ static int handle_invept(struct kvm_vcpu *vcpu)
static int handle_invvpid(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception(vcpu, UD_VECTOR);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info;
+ unsigned long type, types;
+ gva_t gva;
+ struct x86_exception e;
+ int vpid;
+
+ if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_VPID) ||
+ !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
+
+ if (!(types & (1UL << type))) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return 1;
+ }
+
+ /* According to the Intel VMX instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global).
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
+ sizeof(u32), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ switch (type) {
+ case VMX_VPID_EXTENT_ALL_CONTEXT:
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+ nested_vmx_succeed(vcpu);
+ break;
+ default:
+ /* Trap single context invalidation invvpid calls */
+ BUG_ON(1);
+ break;
+ }
+
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7207,6 +7432,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_pcommit(struct kvm_vcpu *vcpu)
+{
+ /* The pcommit instruction is never intercepted for the L1 guest. */
+ WARN_ON(1);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7257,6 +7489,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_PCOMMIT] = handle_pcommit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7558,6 +7791,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PCOMMIT:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
default:
return true;
}
@@ -7569,10 +7804,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}
-static int vmx_enable_pml(struct vcpu_vmx *vmx)
+static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
{
struct page *pml_pg;
- u32 exec_control;
pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!pml_pg)
@@ -7583,24 +7817,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_ENABLE_PML;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-
return 0;
}
-static void vmx_disable_pml(struct vcpu_vmx *vmx)
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
- u32 exec_control;
-
- ASSERT(vmx->pml_pg);
- __free_page(vmx->pml_pg);
- vmx->pml_pg = NULL;
-
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ if (vmx->pml_pg) {
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+ }
}
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -7782,6 +8007,9 @@ static void dump_vmcs(void)
vmcs_read32(IDT_VECTORING_INFO_FIELD),
vmcs_read32(IDT_VECTORING_ERROR_CODE));
pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+ pr_err("TSC Multiplier = 0x%016lx\n",
+ vmcs_readl(TSC_MULTIPLIER));
if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
@@ -7924,10 +8152,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
* apicv
*/
if (!cpu_has_vmx_virtualize_x2apic_mode() ||
- !vmx_vm_has_apicv(vcpu->kvm))
+ !vmx_cpu_uses_apicv(vcpu))
return;
- if (!vm_need_tpr_shadow(vcpu->kvm))
+ if (!cpu_need_tpr_shadow(vcpu))
return;
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -8029,9 +8257,10 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
}
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
- if (!vmx_vm_has_apicv(vcpu->kvm))
+ u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
+ if (!vmx_cpu_uses_apicv(vcpu))
return;
vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8477,8 +8706,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (enable_pml)
- vmx_disable_pml(vmx);
- free_vpid(vmx);
+ vmx_destroy_pml_buffer(vmx);
+ free_vpid(vmx->vpid);
leave_guest_mode(vcpu);
vmx_load_vmcs01(vcpu);
free_nested(vmx);
@@ -8497,7 +8726,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx)
return ERR_PTR(-ENOMEM);
- allocate_vpid(vmx);
+ vmx->vpid = allocate_vpid();
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
@@ -8530,7 +8759,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
put_cpu();
if (err)
goto free_vmcs;
- if (vm_need_virtualize_apic_accesses(kvm)) {
+ if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
goto free_vmcs;
@@ -8545,8 +8774,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
- if (nested)
+ if (nested) {
nested_vmx_setup_ctls_msrs(vmx);
+ vmx->nested.vpid02 = allocate_vpid();
+ }
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
@@ -8559,7 +8790,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
* for the guest, etc.
*/
if (enable_pml) {
- err = vmx_enable_pml(vmx);
+ err = vmx_create_pml_buffer(vmx);
if (err)
goto free_vmcs;
}
@@ -8567,13 +8798,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
return &vmx->vcpu;
free_vmcs:
+ free_vpid(vmx->nested.vpid02);
free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
- free_vpid(vmx);
+ free_vpid(vmx->vpid);
kmem_cache_free(kvm_vcpu_cache, vmx);
return ERR_PTR(err);
}
@@ -8648,49 +8880,67 @@ static int vmx_get_lpage_level(void)
return PT_PDPE_LEVEL;
}
+static void vmcs_set_secondary_exec_control(u32 new_ctl)
+{
+ /*
+ * These bits in the secondary execution controls field
+ * are dynamic; the others are mostly determined by the
+ * hypervisor architecture and the guest's CPUID. Do not
+ * touch the dynamic bits.
+ */
+ u32 mask =
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+ u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ (new_ctl & ~mask) | (cur_ctl & mask));
+}
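The merge above keeps whichever dynamic bits are currently programmed and takes everything else from the freshly computed value. A tiny standalone check of that idiom, using made-up bit values rather than the real SECONDARY_EXEC_* constants:

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                uint32_t mask    = 0x000000c1; /* pretend these are the dynamic bits */
                uint32_t cur_ctl = 0x00000081; /* dynamic bits currently set in the VMCS */
                uint32_t new_ctl = 0x00001040; /* freshly computed control value */

                uint32_t merged = (new_ctl & ~mask) | (cur_ctl & mask);

                /*
                 * Bit 6 (0x40) is dynamic, so its current (clear) state wins even
                 * though new_ctl requested it; bits 0 and 7 are kept from cur_ctl;
                 * bit 12 (0x1000) is taken from new_ctl.
                 */
                assert(merged == 0x00001081);
                return 0;
        }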
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control;
+ u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
- vmx->rdtscp_enabled = false;
if (vmx_rdtscp_supported()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- if (exec_control & SECONDARY_EXEC_RDTSCP) {
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
- vmx->rdtscp_enabled = true;
- else {
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
+ bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
+ if (!rdtscp_enabled)
+ secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
+
+ if (nested) {
+ if (rdtscp_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_RDTSCP;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDTSCP;
}
- if (nested && !vmx->rdtscp_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
}
/* Exposing INVPCID only when PCID is exposed */
best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
if (vmx_invpcid_supported() &&
- best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
- guest_cpuid_has_pcid(vcpu)) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- } else {
- if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
+ (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
+ !guest_cpuid_has_pcid(vcpu))) {
+ secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+
if (best)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
+
+ vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+ if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
+ if (guest_cpuid_has_pcommit(vcpu))
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_PCOMMIT;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_PCOMMIT;
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9298,13 +9548,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (cpu_has_secondary_exec_ctrls()) {
exec_control = vmx_secondary_exec_control(vmx);
- if (!vmx->rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_APIC_REGISTER_VIRT);
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_PCOMMIT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9323,7 +9573,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->nested.apic_access_page));
} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
- (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
+ cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
kvm_vcpu_reload_apic_access_page(vcpu);
@@ -9433,12 +9683,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (enable_vpid) {
/*
- * Trivially support vpid by letting L2s share their parent
- * L1's vpid. TODO: move to a more elaborate solution, giving
- * each L2 its own vpid and exposing the vpid feature to L1.
+ * There is no direct mapping between vpid02 and vpid12: vpid02
+ * is per-vCPU on the L0 side and is reused, while the value of
+ * vpid12 is changed with one invvpid during nested vmentry.
+ * vpid12 is allocated by L1 for L2, so it does not influence
+ * the global bitmap (used for vpid01 and vpid02 allocation)
+ * even if L1 spawns a lot of nested vCPUs.
*/
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx_flush_tlb(vcpu);
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+ if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+ }
+ } else {
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ vmx_flush_tlb(vcpu);
+ }
+
}
if (nested_cpu_has_ept(vmcs12)) {
@@ -10278,6 +10540,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
+/*
+ * This routine does the following for a vCPU that is about to be
+ * blocked while VT-d PI is enabled:
+ * - Store the vCPU on the wakeup list, so that when an interrupt
+ *   arrives the right vCPU can be found and woken up.
+ * - Change the posted-interrupt descriptor as follows:
+ *       'NDST' <-- vcpu->pre_pcpu
+ *       'NV'   <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' becomes set during this process (meaning at least one
+ *   interrupt has been posted for this vCPU), the vCPU cannot be
+ *   blocked; return 1 in that case, otherwise return 0.
+ */
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * We should not block the vCPU if
+ * an interrupt is posted for it.
+ */
+ if (pi_test_on(pi_desc) == 1) {
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+
+ return 1;
+ }
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since the vCPU can be preempted during this process,
+ * vcpu->cpu may differ from pre_pcpu. Use pre_pcpu as
+ * the destination of the wakeup notification event, so
+ * that the wakeup handler can find the right vCPU to
+ * wake up if an interrupt arrives while the vCPU is
+ * blocked.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ return 0;
+}
+
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+ unsigned long flags;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if (vcpu->pre_pcpu != -1) {
+ spin_lock_irqsave(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+ }
+}
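Both blocks above program the descriptor's 'NDST' field the same way: in x2APIC mode the full physical APIC ID is used, while in xAPIC mode the 8-bit ID is placed in bits 15:8. A one-line helper capturing that encoding (pi_encode_ndst is a name made up for this sketch):

        #include <stdbool.h>
        #include <stdint.h>

        /* Encode a physical APIC ID into the PI descriptor's NDST field. */
        static uint32_t pi_encode_ndst(uint32_t apic_id, bool x2apic)
        {
                return x2apic ? apic_id : (apic_id << 8) & 0xFF00;
        }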
+
+/*
+ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot post multicast/broadcast interrupts to a
+ * vCPU, so interrupt remapping is still used for those
+ * kinds of interrupts.
+ *
+ * For lowest-priority interrupts, only those with a single
+ * CPU as the destination are supported, e.g. when the user
+ * configures the interrupt via /proc/irq or uses irqbalance
+ * to make it single-CPU.
+ *
+ * Full lowest-priority interrupt support will be added later.
+ */
+
+ kvm_set_msi_irq(e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+ continue;
+
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else {
+ /* suppress notification event before unposting */
+ pi_set_sn(vcpu_to_pi_desc(vcpu));
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ pi_clear_sn(vcpu_to_pi_desc(vcpu));
+ }
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -10297,7 +10754,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_db_bp_intercept = update_exception_bitmap,
+ .update_bp_intercept = update_exception_bitmap,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
@@ -10347,7 +10804,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .vm_has_apicv = vmx_vm_has_apicv,
+ .cpu_uses_apicv = vmx_cpu_uses_apicv,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10371,11 +10828,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .set_tsc_khz = vmx_set_tsc_khz,
.read_tsc_offset = vmx_read_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
- .adjust_tsc_offset = vmx_adjust_tsc_offset,
- .compute_tsc_offset = vmx_compute_tsc_offset,
+ .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
.read_l1_tsc = vmx_read_l1_tsc,
.set_tdp_cr3 = vmx_set_cr3,
@@ -10394,7 +10849,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .pre_block = vmx_pre_block,
+ .post_block = vmx_post_block,
+
.pmu_ops = &intel_pmu_ops,
+
+ .update_pi_irte = vmx_update_pi_irte,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bda65690788e..eed32283d22c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -51,6 +51,8 @@
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -64,6 +66,7 @@
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
+#include <asm/irq_remapping.h>
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
@@ -90,10 +93,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-struct kvm_x86_ops *kvm_x86_ops;
+struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
-static bool ignore_msrs = 0;
+static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
unsigned int min_timer_period_us = 500;
@@ -102,20 +105,25 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);
-bool kvm_has_tsc_control;
+bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32 kvm_max_guest_tsc_khz;
+u32 __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
+EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
+u64 __read_mostly kvm_max_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
+static u64 __read_mostly kvm_default_tsc_scaling_ratio;
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
-static u32 tsc_tolerance_ppm = 250;
+static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int lapic_timer_advance_ns = 0;
+unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-static bool backwards_tsc_observed = false;
+static bool __read_mostly backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -622,7 +630,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
- if ((cr0 ^ old_cr0) & X86_CR0_CD)
+ if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+ kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
return 0;
@@ -789,7 +799,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
@@ -799,7 +809,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
@@ -953,6 +963,9 @@ static u32 emulated_msrs[] = {
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
@@ -1241,14 +1254,53 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
return v;
}
-static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
+{
+ u64 ratio;
+
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return 0;
+ }
+
+ /* TSC scaling supported? */
+ if (!kvm_has_tsc_control) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ return 0;
+ } else {
+ WARN(1, "user requested TSC rate below hardware speed\n");
+ return -1;
+ }
+ }
+
+ /* TSC scaling required - calculate ratio */
+ ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+ user_tsc_khz, tsc_khz);
+
+ if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+ WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return -1;
+ }
+
+ vcpu->arch.tsc_scaling_ratio = ratio;
+ return 0;
+}
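For a concrete feel for the ratio computed above — a standalone sketch, with mul_u64_u32_div_demo() standing in for the kernel's mul_u64_u32_div() and 48 fractional bits assumed as on VMX:

        #include <stdint.h>
        #include <stdio.h>

        static uint64_t mul_u64_u32_div_demo(uint64_t a, uint32_t mul, uint32_t div)
        {
                return (uint64_t)(((unsigned __int128)a * mul) / div);
        }

        int main(void)
        {
                uint32_t frac_bits    = 48;      /* kvm_tsc_scaling_ratio_frac_bits on VMX */
                uint32_t user_tsc_khz = 2600000; /* requested guest rate: 2.6 GHz */
                uint32_t tsc_khz      = 2000000; /* host rate: 2.0 GHz */

                uint64_t ratio = mul_u64_u32_div_demo(1ULL << frac_bits,
                                                      user_tsc_khz, tsc_khz);

                /* ratio == 1.3 * 2^48, i.e. the guest TSC advances 1.3x as fast. */
                printf("ratio = %#llx (%.3f)\n", (unsigned long long)ratio,
                       (double)ratio / (double)(1ULL << frac_bits));
                return 0;
        }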
+
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
/* tsc_khz can be zero if TSC calibration fails */
- if (this_tsc_khz == 0)
- return;
+ if (this_tsc_khz == 0) {
+ /* set tsc_scaling_ratio to a safe value */
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return -1;
+ }
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
@@ -1268,7 +1320,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
- kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+ return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1314,6 +1366,48 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
+/*
+ * Multiply tsc by a fixed point number represented by ratio.
+ *
+ * The most significant 64-N bits (mult) of ratio represent the
+ * integral part of the fixed point number; the remaining N bits
+ * (frac) represent the fractional part, i.e. ratio represents a fixed
+ * point number (mult + frac * 2^(-N)).
+ *
+ * N equals kvm_tsc_scaling_ratio_frac_bits.
+ */
+static inline u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+}
+
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ u64 _tsc = tsc;
+ u64 ratio = vcpu->arch.tsc_scaling_ratio;
+
+ if (ratio != kvm_default_tsc_scaling_ratio)
+ _tsc = __scale_tsc(ratio, tsc);
+
+ return _tsc;
+}
+EXPORT_SYMBOL_GPL(kvm_scale_tsc);
+
+static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc());
+
+ return target_tsc - tsc;
+}
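The subtraction above is done against the scaled host TSC because the guest-visible value is composed after scaling:

        guest_tsc(t) = kvm_scale_tsc(vcpu, host_tsc(t)) + tsc_offset
        tsc_offset   = target_tsc - kvm_scale_tsc(vcpu, host_tsc(now))
        => guest_tsc(now) = target_tsc

so a read taken at the moment the offset is applied observes exactly the requested target value.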
+
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+{
+ return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc));
+}
+EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1325,7 +1419,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1382,7 +1476,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
@@ -1439,6 +1533,20 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
EXPORT_SYMBOL_GPL(kvm_write_tsc);
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+}
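A host-side adjustment is expressed in host cycles, so it is converted to guest cycles before being folded into the offset: with a scaling ratio of 1.5, for example, a host adjustment of 1000 cycles becomes kvm_scale_tsc(vcpu, 1000) = 1500 guest cycles. The WARN_ON presumably flags negative adjustments because kvm_scale_tsc() operates on an unsigned value, so a negative delta would not scale correctly.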
+
#ifdef CONFIG_X86_64
static cycle_t read_tsc(void)
@@ -1600,7 +1708,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- unsigned long flags, this_tsc_khz;
+ unsigned long flags, this_tsc_khz, tgt_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -1637,7 +1745,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kernel_ns = get_kernel_ns();
}
- tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+ tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
/*
* We may have to catch up the TSC to match elapsed wall clock
@@ -1663,7 +1771,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
return 0;
if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+ tgt_tsc_khz = kvm_has_tsc_control ?
+ vcpu->virtual_tsc_khz : this_tsc_khz;
+ kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = this_tsc_khz;
@@ -1898,6 +2008,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
+ accumulate_steal_time(vcpu);
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -2048,12 +2160,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & KVM_MSR_ENABLED))
break;
- vcpu->arch.st.last_steal = current->sched_info.run_delay;
-
- preempt_disable();
- accumulate_steal_time(vcpu);
- preempt_enable();
-
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
break;
@@ -2449,6 +2555,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
+ case KVM_CAP_SPLIT_IRQCHIP:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2612,7 +2719,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
- u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+ u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
@@ -2628,7 +2735,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu->cpu = cpu;
}
- accumulate_steal_time(vcpu);
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
@@ -2657,17 +2763,50 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
+{
+ return (!lapic_in_kernel(vcpu) ||
+ kvm_apic_accept_pic_intr(vcpu));
+}
+
+/*
+ * If userspace requested an interrupt window, check that the
+ * interrupt window is open.
+ *
+ * No need to exit to userspace if we already have an interrupt queued.
+ */
+static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_interrupt_allowed(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+}
+
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
if (irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
- if (irqchip_in_kernel(vcpu->kvm))
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+
+ /*
+ * With an in-kernel LAPIC, this path is only used to inject an
+ * EXTINT, so fail if the 8259 PIC is also in the kernel.
+ */
+ if (pic_in_kernel(vcpu->kvm))
return -ENXIO;
- kvm_queue_interrupt(vcpu, irq->irq, false);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (vcpu->arch.pending_external_vector != -1)
+ return -EEXIST;
+ vcpu->arch.pending_external_vector = irq->irq;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
@@ -3176,7 +3315,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vapic_addr va;
r = -EINVAL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
goto out;
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
@@ -3303,9 +3442,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- kvm_set_tsc_khz(vcpu, user_tsc_khz);
+ if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
+ r = 0;
- r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
@@ -3425,41 +3564,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- int r = 0;
-
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- int r = 0;
-
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
- int r = 0;
-
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
memset(&ps->reserved, 0, sizeof(ps->reserved));
- return r;
+ return 0;
}
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
- int r = 0, start = 0;
+ int start = 0;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3471,7 +3604,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
kvm->arch.vpit->pit_state.flags = ps->flags;
kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
@@ -3556,6 +3689,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
break;
+ case KVM_CAP_SPLIT_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+ goto split_irqchip_unlock;
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto split_irqchip_unlock;
+ if (atomic_read(&kvm->online_vcpus))
+ goto split_irqchip_unlock;
+ r = kvm_setup_empty_irq_routing(kvm);
+ if (r)
+ goto split_irqchip_unlock;
+ /* Pairs with irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_split = true;
+ kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ r = 0;
+split_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -3669,7 +3824,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -3693,7 +3848,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
@@ -4060,6 +4215,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
+static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
+
+ return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
+}
+
int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val,
unsigned int bytes,
@@ -4795,6 +4959,7 @@ static const struct x86_emulate_ops emulate_ops = {
.write_gpr = emulator_write_gpr,
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
+ .read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@@ -5667,7 +5832,7 @@ void kvm_arch_exit(void)
int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
- if (irqchip_in_kernel(vcpu->kvm)) {
+ if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
@@ -5766,17 +5931,10 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
}
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
- return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
- vcpu->run->request_interrupt_window &&
- kvm_arch_interrupt_allowed(vcpu));
+ return vcpu->run->request_interrupt_window &&
+ likely(!pic_in_kernel(vcpu->kvm));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -5787,13 +5945,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_run->ready_for_interrupt_injection = 1;
- else
- kvm_run->ready_for_interrupt_injection =
- kvm_arch_interrupt_allowed(vcpu) &&
- !kvm_cpu_has_interrupt(vcpu) &&
- !kvm_event_needs_reinjection(vcpu);
+ kvm_run->ready_for_interrupt_injection =
+ pic_in_kernel(vcpu->kvm) ||
+ kvm_vcpu_ready_for_interrupt_injection(vcpu);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -6144,18 +6298,18 @@ static void process_smi(struct kvm_vcpu *vcpu)
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
- u64 eoi_exit_bitmap[4];
- u32 tmr[8];
-
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- memset(eoi_exit_bitmap, 0, 32);
- memset(tmr, 0, 32);
+ memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
- kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
- kvm_apic_update_tmr(vcpu, tmr);
+ if (irqchip_split(vcpu->kvm))
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
+ else {
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+ }
+ kvm_x86_ops->load_eoi_exitmap(vcpu);
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6168,7 +6322,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
struct page *page = NULL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
return;
if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6206,8 +6360,10 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
- bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window;
+ bool req_int_win =
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+
bool req_immediate_exit = false;
if (vcpu->requests) {
@@ -6258,6 +6414,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ (void *) vcpu->arch.eoi_exit_bitmap)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6268,6 +6435,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ r = 0;
+ goto out;
+ }
+ }
+
+ /*
+ * KVM_REQ_EVENT is not set when posted interrupts are set by
+ * VT-d hardware, so we have to update RVI unconditionally.
+ */
+ if (kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ kvm_lapic_find_highest_irr(vcpu));
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6286,13 +6473,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
- /*
- * Update architecture specific hints for APIC
- * virtual interrupt delivery.
- */
- if (kvm_x86_ops->hwapic_irr_update)
- kvm_x86_ops->hwapic_irr_update(vcpu,
- kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@@ -6376,8 +6556,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
- rdtsc());
+ vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -6428,10 +6607,15 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
- if (!kvm_arch_vcpu_runnable(vcpu)) {
+ if (!kvm_arch_vcpu_runnable(vcpu) &&
+ (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_x86_ops->post_block)
+ kvm_x86_ops->post_block(vcpu);
+
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
}
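The contract implied by the new hooks above, sketched as a hypothetical implementation (the in-tree user at this point appears to be VMX posted interrupts): pre_block() may veto the block by returning non-zero, which sends the vCPU around the loop again; otherwise post_block() undoes whatever pre_block() armed once kvm_vcpu_block() returns. Both helpers below are made-up names.

static int example_pre_block(struct kvm_vcpu *vcpu)
{
	if (!can_track_wakeups(vcpu))		/* hypothetical predicate */
		return 1;			/* veto: do not sleep */

	arm_wakeup_notification(vcpu);		/* hypothetical */
	return 0;
}

static void example_post_block(struct kvm_vcpu *vcpu)
{
	disarm_wakeup_notification(vcpu);	/* hypothetical */
}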
@@ -6468,10 +6652,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
for (;;) {
- if (kvm_vcpu_running(vcpu))
+ if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
- else
+ } else {
r = vcpu_block(kvm, vcpu);
+ }
+
if (r <= 0)
break;
@@ -6479,9 +6665,10 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
- if (dm_request_for_irq_injection(vcpu)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
+ if (dm_request_for_irq_injection(vcpu) &&
+ kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}
@@ -6608,7 +6795,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm)) {
+ if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
r = -EINVAL;
goto out;
@@ -6932,7 +7119,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops->update_db_bp_intercept(vcpu);
+ kvm_x86_ops->update_bp_intercept(vcpu);
r = 0;
@@ -7281,6 +7468,20 @@ int kvm_arch_hardware_setup(void)
if (r != 0)
return r;
+ if (kvm_has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not needed because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
+ kvm_max_guest_tsc_khz = max;
+
+ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+ }
+
kvm_init_msr_list();
return 0;
}
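As a rough sketch of the clamp computed above, assuming __scale_tsc() is the usual fixed-point multiply, i.e. (ratio * value) >> kvm_tsc_scaling_ratio_frac_bits, so that the default ratio of 1 << frac_bits encodes 1.0:

#include <stdint.h>

/* A 128-bit intermediate mirrors what a mul_u64_u64_shr()-style helper
 * would do to keep the multiply from overflowing. */
static uint64_t scale_khz(uint64_t ratio, uint64_t khz, unsigned int frac_bits)
{
	return (uint64_t)(((unsigned __int128)ratio * khz) >> frac_bits);
}

/* kvm_max_guest_tsc_khz then becomes
 *	min(0x7fffffff, scale_khz(kvm_max_tsc_scaling_ratio, tsc_khz, frac_bits)),
 * i.e. the highest guest TSC frequency the maximum hardware scaling ratio can
 * produce from the host frequency, clamped so it still fits a signed int. */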
@@ -7308,7 +7509,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
{
- return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+ return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
}
struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7377,6 +7578,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
+ vcpu->arch.pending_external_vector = -1;
+
return 0;
fail_free_mce_banks:
@@ -7402,7 +7605,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
static_key_slow_dec(&kvm_no_apic_vcpu);
}
@@ -8029,7 +8232,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ if (kvm_x86_ops->update_pi_irte) {
+ irqfd->producer = prod;
+ return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
+ }
+
+ return -EINVAL;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ if (!kvm_x86_ops->update_pi_irte) {
+ WARN_ON(irqfd->producer != NULL);
+ return;
+ }
+
+ WARN_ON(irqfd->producer != prod);
+ irqfd->producer = NULL;
+
+ /*
+ * When the producer of a consumer is unregistered, we change back to
+ * remapped mode, so we can re-use the current implementation
+ * when the irq is masked/disabled or the consumer side (KVM
+ * in this case) doesn't want to receive the interrupts.
+ */
+ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ if (ret)
+ printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+ " fails: %d\n", irqfd->consumer.token, ret);
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ if (!kvm_x86_ops->update_pi_irte)
+ return -EINVAL;
+
+ return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@@ -8044,3 +8299,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 1bf417e9cc13..0f1c6fc3ddd8 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -89,7 +89,7 @@ static struct addr_marker address_markers[] = {
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
- { 0/*PKMAP_BASE*/, "Persisent kmap() Area" },
+ { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
@@ -358,6 +358,21 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
+#ifdef CONFIG_X86_64
+static inline bool is_hypervisor_range(int idx)
+{
+ /*
+ * ffff800000000000 - ffff87ffffffffff is reserved for
+ * the hypervisor.
+ */
+ return paravirt_enabled() &&
+ (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
+ (idx < pgd_index(__PAGE_OFFSET));
+}
+#else
+static inline bool is_hypervisor_range(int idx) { return false; }
+#endif
+
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
{
@@ -381,7 +396,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
- if (!pgd_none(*start)) {
+ if (!pgd_none(*start) && !is_hypervisor_range(i)) {
if (pgd_large(*start) || !pgd_present(*start)) {
prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
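A quick back-of-the-envelope check of the 16-entry window tested in is_hypervisor_range(), assuming 4-level paging (one PGD entry spans 2^39 bytes, i.e. 512 GiB) and the then-default __PAGE_OFFSET of 0xffff880000000000:

/*
 *   0xffff87ffffffffff - 0xffff800000000000 + 1 = 2^43 bytes = 8 TiB
 *   8 TiB / 512 GiB per PGD entry               = 16 entries
 *
 * so the reserved hypervisor range occupies exactly the 16 PGD slots just
 * below pgd_index(__PAGE_OFFSET), which is the window the dumper now skips
 * when paravirt_enabled().
 */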
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index eecb207a2037..a6d739258137 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -104,20 +104,6 @@ void __kunmap_atomic(void *kvaddr)
}
EXPORT_SYMBOL(__kunmap_atomic);
-struct page *kmap_atomic_to_page(void *ptr)
-{
- unsigned long idx, vaddr = (unsigned long)ptr;
- pte_t *pte;
-
- if (vaddr < FIXADDR_START)
- return virt_to_page(ptr);
-
- idx = virt_to_fix(vaddr);
- pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
- return pte_page(*pte);
-}
-EXPORT_SYMBOL(kmap_atomic_to_page);
-
void __init set_highmem_pages_init(void)
{
struct zone *zone;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1f37cb2b56a9..493f54172b4a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -354,7 +354,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
}
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+ pr_debug(" [mem %#010lx-%#010lx] page %s\n",
mr[i].start, mr[i].end - 1,
page_size_string(&mr[i]));
@@ -401,7 +401,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
unsigned long ret = 0;
int nr_range, i;
- pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
start, end - 1);
memset(mr, 0, sizeof(mr));
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5ed62eff31bd..ec081fe0ce2c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1270,7 +1270,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
/* check to see if we have contiguous blocks */
if (p_end != p || node_start != node) {
if (p_start)
- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
addr_start, addr_end-1, p_start, p_end-1, node_start);
addr_start = addr;
node_start = node;
@@ -1368,7 +1368,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
void __meminit vmemmap_populate_print_last(void)
{
if (p_start) {
- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
addr_start, addr_end-1, p_start, p_end-1, node_start);
p_start = NULL;
p_end = NULL;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 9ce5da27b136..d470cf219a2d 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -126,5 +126,5 @@ void __init kasan_init(void)
__flush_tlb_all();
init_task.kasan_depth = 0;
- pr_info("Kernel address sanitizer initialized\n");
+ pr_info("KernelAddressSanitizer initialized\n");
}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index b0ae85f90f10..b2fd67da1701 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -101,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
switch (type) {
case REG_TYPE_RM:
regno = X86_MODRM_RM(insn->modrm.value);
- if (X86_REX_B(insn->rex_prefix.value) == 1)
+ if (X86_REX_B(insn->rex_prefix.value))
regno += 8;
break;
case REG_TYPE_INDEX:
regno = X86_SIB_INDEX(insn->sib.value);
- if (X86_REX_X(insn->rex_prefix.value) == 1)
+ if (X86_REX_X(insn->rex_prefix.value))
regno += 8;
break;
case REG_TYPE_BASE:
regno = X86_SIB_BASE(insn->sib.value);
- if (X86_REX_B(insn->rex_prefix.value) == 1)
+ if (X86_REX_B(insn->rex_prefix.value))
regno += 8;
break;
@@ -586,6 +586,29 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
}
/*
+ * We only want to do a 4-byte get_user() on 32-bit. Otherwise,
+ * we might run off the end of the bounds table if we are on
+ * a 64-bit kernel and try to get 8 bytes.
+ */
+int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
+ long __user *bd_entry_ptr)
+{
+ u32 bd_entry_32;
+ int ret;
+
+ if (is_64bit_mm(mm))
+ return get_user(*bd_entry_ret, bd_entry_ptr);
+
+ /*
+ * Note that get_user() uses the type of the *pointer* to
+ * establish the size of the get, not the destination.
+ */
+ ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr);
+ *bd_entry_ret = bd_entry_32;
+ return ret;
+}
+
+/*
* Get the base of bounds tables pointed by specific bounds
* directory entry.
*/
@@ -605,7 +628,7 @@ static int get_bt_addr(struct mm_struct *mm,
int need_write = 0;
pagefault_disable();
- ret = get_user(bd_entry, bd_entry_ptr);
+ ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr);
pagefault_enable();
if (!ret)
break;
@@ -700,11 +723,23 @@ static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
*/
static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
{
- unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
- if (is_64bit_mm(mm))
- return virt_space / MPX_BD_NR_ENTRIES_64;
- else
- return virt_space / MPX_BD_NR_ENTRIES_32;
+ unsigned long long virt_space;
+ unsigned long long GB = (1ULL << 30);
+
+ /*
+ * This covers 32-bit emulation as well as 32-bit kernels
+ * running on 64-bit hardware.
+ */
+ if (!is_64bit_mm(mm))
+ return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
+
+ /*
+ * 'x86_virt_bits' returns what the hardware is capable
+ * of, and returns the full >32-bit address space when
+ * running 32-bit kernels on 64-bit hardware.
+ */
+ virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
+ return virt_space / MPX_BD_NR_ENTRIES_64;
}
/*
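Rough numbers behind bd_entry_virt_space(), assuming the usual MPX layout: a 4 MiB bounds directory of 4-byte entries for 32-bit tasks, a 2 GiB directory of 8-byte entries for 64-bit tasks, and x86_virt_bits = 48 on current hardware.

/*
 *   32-bit:  4 GiB  / (4 MiB / 4 B entries)  = 4 GiB  / 2^20 = 4 KiB of
 *            virtual space covered per directory entry
 *   64-bit:  2^48 B / (2 GiB / 8 B entries)  = 2^48 B / 2^28 = 1 MiB of
 *            virtual space covered per directory entry
 *
 * which is why the 32-bit case must not use x86_virt_bits: a 32-bit task on
 * 64-bit hardware still only has 4 GiB of virtual space to divide up.
 */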
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 70efcd0940f9..75991979f667 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1109,7 +1109,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
bpf_flush_icache(header, image + proglen);
set_memory_ro((unsigned long)header, header->pages);
prog->bpf_func = (void *)image;
- prog->jited = true;
+ prog->jited = 1;
}
out:
kfree(addrs);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index ff9911707160..3cd69832d7f4 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -4,16 +4,15 @@
#include <linux/irq.h>
#include <linux/dmi.h>
#include <linux/slab.h>
+#include <linux/pci-acpi.h>
#include <asm/numa.h>
#include <asm/pci_x86.h>
struct pci_root_info {
- struct acpi_device *bridge;
- char name[16];
+ struct acpi_pci_root_info common;
struct pci_sysdata sd;
#ifdef CONFIG_PCI_MMCONFIG
bool mcfg_added;
- u16 segment;
u8 start_bus;
u8 end_bus;
#endif
@@ -178,15 +177,18 @@ static int check_segment(u16 seg, struct device *dev, char *estr)
return 0;
}
-static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start,
- u8 end, phys_addr_t addr)
+static int setup_mcfg_map(struct acpi_pci_root_info *ci)
{
- int result;
- struct device *dev = &info->bridge->dev;
+ int result, seg;
+ struct pci_root_info *info;
+ struct acpi_pci_root *root = ci->root;
+ struct device *dev = &ci->bridge->dev;
- info->start_bus = start;
- info->end_bus = end;
+ info = container_of(ci, struct pci_root_info, common);
+ info->start_bus = (u8)root->secondary.start;
+ info->end_bus = (u8)root->secondary.end;
info->mcfg_added = false;
+ seg = info->sd.domain;
/* return success if MMCFG is not in use */
if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
@@ -195,7 +197,8 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start,
if (!(pci_probe & PCI_PROBE_MMCONF))
return check_segment(seg, dev, "MMCONFIG is disabled,");
- result = pci_mmconfig_insert(dev, seg, start, end, addr);
+ result = pci_mmconfig_insert(dev, seg, info->start_bus, info->end_bus,
+ root->mcfg_addr);
if (result == 0) {
/* enable MMCFG if it hasn't been enabled yet */
if (raw_pci_ext_ops == NULL)
@@ -208,134 +211,55 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start,
return 0;
}
-static void teardown_mcfg_map(struct pci_root_info *info)
+static void teardown_mcfg_map(struct acpi_pci_root_info *ci)
{
+ struct pci_root_info *info;
+
+ info = container_of(ci, struct pci_root_info, common);
if (info->mcfg_added) {
- pci_mmconfig_delete(info->segment, info->start_bus,
- info->end_bus);
+ pci_mmconfig_delete(info->sd.domain,
+ info->start_bus, info->end_bus);
info->mcfg_added = false;
}
}
#else
-static int setup_mcfg_map(struct pci_root_info *info,
- u16 seg, u8 start, u8 end,
- phys_addr_t addr)
+static int setup_mcfg_map(struct acpi_pci_root_info *ci)
{
return 0;
}
-static void teardown_mcfg_map(struct pci_root_info *info)
+
+static void teardown_mcfg_map(struct acpi_pci_root_info *ci)
{
}
#endif
-static void validate_resources(struct device *dev, struct list_head *crs_res,
- unsigned long type)
+static int pci_acpi_root_get_node(struct acpi_pci_root *root)
{
- LIST_HEAD(list);
- struct resource *res1, *res2, *root = NULL;
- struct resource_entry *tmp, *entry, *entry2;
-
- BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0);
- root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource;
-
- list_splice_init(crs_res, &list);
- resource_list_for_each_entry_safe(entry, tmp, &list) {
- bool free = false;
- resource_size_t end;
-
- res1 = entry->res;
- if (!(res1->flags & type))
- goto next;
-
- /* Exclude non-addressable range or non-addressable portion */
- end = min(res1->end, root->end);
- if (end <= res1->start) {
- dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n",
- res1);
- free = true;
- goto next;
- } else if (res1->end != end) {
- dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n",
- res1, (unsigned long long)end + 1,
- (unsigned long long)res1->end);
- res1->end = end;
- }
-
- resource_list_for_each_entry(entry2, crs_res) {
- res2 = entry2->res;
- if (!(res2->flags & type))
- continue;
-
- /*
- * I don't like throwing away windows because then
- * our resources no longer match the ACPI _CRS, but
- * the kernel resource tree doesn't allow overlaps.
- */
- if (resource_overlaps(res1, res2)) {
- res2->start = min(res1->start, res2->start);
- res2->end = max(res1->end, res2->end);
- dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n",
- res2, res1);
- free = true;
- goto next;
- }
- }
+ int busnum = root->secondary.start;
+ struct acpi_device *device = root->device;
+ int node = acpi_get_node(device->handle);
-next:
- resource_list_del(entry);
- if (free)
- resource_list_free_entry(entry);
- else
- resource_list_add_tail(entry, crs_res);
+ if (node == NUMA_NO_NODE) {
+ node = x86_pci_root_bus_node(busnum);
+ if (node != 0 && node != NUMA_NO_NODE)
+ dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
+ node);
}
+ if (node != NUMA_NO_NODE && !node_online(node))
+ node = NUMA_NO_NODE;
+
+ return node;
}
-static void add_resources(struct pci_root_info *info,
- struct list_head *resources,
- struct list_head *crs_res)
+static int pci_acpi_root_init_info(struct acpi_pci_root_info *ci)
{
- struct resource_entry *entry, *tmp;
- struct resource *res, *conflict, *root = NULL;
-
- validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM);
- validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO);
-
- resource_list_for_each_entry_safe(entry, tmp, crs_res) {
- res = entry->res;
- if (res->flags & IORESOURCE_MEM)
- root = &iomem_resource;
- else if (res->flags & IORESOURCE_IO)
- root = &ioport_resource;
- else
- BUG_ON(res);
-
- conflict = insert_resource_conflict(root, res);
- if (conflict) {
- dev_info(&info->bridge->dev,
- "ignoring host bridge window %pR (conflicts with %s %pR)\n",
- res, conflict->name, conflict);
- resource_list_destroy_entry(entry);
- }
- }
-
- list_splice_tail(crs_res, resources);
+ return setup_mcfg_map(ci);
}
-static void release_pci_root_info(struct pci_host_bridge *bridge)
+static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci)
{
- struct resource *res;
- struct resource_entry *entry;
- struct pci_root_info *info = bridge->release_data;
-
- resource_list_for_each_entry(entry, &bridge->windows) {
- res = entry->res;
- if (res->parent &&
- (res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
- release_resource(res);
- }
-
- teardown_mcfg_map(info);
- kfree(info);
+ teardown_mcfg_map(ci);
+ kfree(container_of(ci, struct pci_root_info, common));
}
/*
@@ -358,50 +282,47 @@ static bool resource_is_pcicfg_ioport(struct resource *res)
res->start == 0xCF8 && res->end == 0xCFF;
}
-static void probe_pci_root_info(struct pci_root_info *info,
- struct acpi_device *device,
- int busnum, int domain,
- struct list_head *list)
+static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
{
- int ret;
+ struct acpi_device *device = ci->bridge;
+ int busnum = ci->root->secondary.start;
struct resource_entry *entry, *tmp;
+ int status;
- sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
- info->bridge = device;
- ret = acpi_dev_get_resources(device, list,
- acpi_dev_filter_resource_type_cb,
- (void *)(IORESOURCE_IO | IORESOURCE_MEM));
- if (ret < 0)
- dev_warn(&device->dev,
- "failed to parse _CRS method, error code %d\n", ret);
- else if (ret == 0)
- dev_dbg(&device->dev,
- "no IO and memory resources present in _CRS\n");
- else
- resource_list_for_each_entry_safe(entry, tmp, list) {
- if ((entry->res->flags & IORESOURCE_DISABLED) ||
- resource_is_pcicfg_ioport(entry->res))
+ status = acpi_pci_probe_root_resources(ci);
+ if (pci_use_crs) {
+ resource_list_for_each_entry_safe(entry, tmp, &ci->resources)
+ if (resource_is_pcicfg_ioport(entry->res))
resource_list_destroy_entry(entry);
- else
- entry->res->name = info->name;
- }
+ return status;
+ }
+
+ resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
+ dev_printk(KERN_DEBUG, &device->dev,
+ "host bridge window %pR (ignored)\n", entry->res);
+ resource_list_destroy_entry(entry);
+ }
+ x86_pci_root_bus_resources(busnum, &ci->resources);
+
+ return 0;
}
+static struct acpi_pci_root_ops acpi_pci_root_ops = {
+ .pci_ops = &pci_root_ops,
+ .init_info = pci_acpi_root_init_info,
+ .release_info = pci_acpi_root_release_info,
+ .prepare_resources = pci_acpi_root_prepare_resources,
+};
+
struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
{
- struct acpi_device *device = root->device;
- struct pci_root_info *info;
int domain = root->segment;
int busnum = root->secondary.start;
- struct resource_entry *res_entry;
- LIST_HEAD(crs_res);
- LIST_HEAD(resources);
+ int node = pci_acpi_root_get_node(root);
struct pci_bus *bus;
- struct pci_sysdata *sd;
- int node;
if (pci_ignore_seg)
- domain = 0;
+ root->segment = domain = 0;
if (domain && !pci_domains_supported) {
printk(KERN_WARNING "pci_bus %04x:%02x: "
@@ -410,71 +331,33 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
return NULL;
}
- node = acpi_get_node(device->handle);
- if (node == NUMA_NO_NODE) {
- node = x86_pci_root_bus_node(busnum);
- if (node != 0 && node != NUMA_NO_NODE)
- dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
- node);
- }
-
- if (node != NUMA_NO_NODE && !node_online(node))
- node = NUMA_NO_NODE;
-
- info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
- if (!info) {
- printk(KERN_WARNING "pci_bus %04x:%02x: "
- "ignored (out of memory)\n", domain, busnum);
- return NULL;
- }
-
- sd = &info->sd;
- sd->domain = domain;
- sd->node = node;
- sd->companion = device;
-
bus = pci_find_bus(domain, busnum);
if (bus) {
/*
* If the desired bus has been scanned already, replace
* its bus->sysdata.
*/
- memcpy(bus->sysdata, sd, sizeof(*sd));
- kfree(info);
- } else {
- /* insert busn res at first */
- pci_add_resource(&resources, &root->secondary);
+ struct pci_sysdata sd = {
+ .domain = domain,
+ .node = node,
+ .companion = root->device
+ };
- /*
- * _CRS with no apertures is normal, so only fall back to
- * defaults or native bridge info if we're ignoring _CRS.
- */
- probe_pci_root_info(info, device, busnum, domain, &crs_res);
- if (pci_use_crs) {
- add_resources(info, &resources, &crs_res);
- } else {
- resource_list_for_each_entry(res_entry, &crs_res)
- dev_printk(KERN_DEBUG, &device->dev,
- "host bridge window %pR (ignored)\n",
- res_entry->res);
- resource_list_free(&crs_res);
- x86_pci_root_bus_resources(busnum, &resources);
- }
-
- if (!setup_mcfg_map(info, domain, (u8)root->secondary.start,
- (u8)root->secondary.end, root->mcfg_addr))
- bus = pci_create_root_bus(NULL, busnum, &pci_root_ops,
- sd, &resources);
-
- if (bus) {
- pci_scan_child_bus(bus);
- pci_set_host_bridge_release(
- to_pci_host_bridge(bus->bridge),
- release_pci_root_info, info);
- } else {
- resource_list_free(&resources);
- teardown_mcfg_map(info);
- kfree(info);
+ memcpy(bus->sysdata, &sd, sizeof(sd));
+ } else {
+ struct pci_root_info *info;
+
+ info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
+ if (!info)
+ dev_err(&root->device->dev,
+ "pci_bus %04x:%02x: ignored (out of memory)\n",
+ domain, busnum);
+ else {
+ info->sd.domain = domain;
+ info->sd.node = node;
+ info->sd.companion = root->device;
+ bus = acpi_pci_root_create(root, &acpi_pci_root_ops,
+ &info->common, &info->sd);
}
}
@@ -487,9 +370,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
pcie_bus_configure_settings(child);
}
- if (bus && node != NUMA_NO_NODE)
- dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node);
-
return bus;
}
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 7bcf06a7cd12..6eb3c8af96e2 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -50,18 +50,9 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
if (!found)
pci_add_resource(resources, &info->busn);
- list_for_each_entry(root_res, &info->resources, list) {
- struct resource *res;
- struct resource *root;
+ list_for_each_entry(root_res, &info->resources, list)
+ pci_add_resource(resources, &root_res->res);
- res = &root_res->res;
- pci_add_resource(resources, res);
- if (res->flags & IORESOURCE_IO)
- root = &ioport_resource;
- else
- root = &iomem_resource;
- insert_resource(root, res);
- }
return;
default_resources:
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dc78a4a9a466..eccd4d99e6a4 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -675,6 +675,14 @@ int pcibios_add_device(struct pci_dev *dev)
int pcibios_alloc_irq(struct pci_dev *dev)
{
+ /*
+ * If the PCI device was already claimed by core code and has
+ * MSI enabled, probing of the pcibios IRQ will overwrite
+ * dev->irq. So bail out if MSI is already enabled.
+ */
+ if (pci_dev_msi_enabled(dev))
+ return -EBUSY;
+
return pcibios_enable_irq(dev);
}
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 5b662c0faf8c..ea6f3802c17b 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -54,7 +54,7 @@ void pcibios_scan_specific_bus(int busn)
}
EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus);
-int __init pci_subsys_init(void)
+static int __init pci_subsys_init(void)
{
/*
* The init function returns a non-zero value when
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 06934a8a4872..e5f854ce2d72 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -211,7 +211,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
if (err)
return 1;
- err = convert_fxsr_from_user(&fpx, sc.fpstate);
+ err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate);
if (err)
return 1;
@@ -227,7 +227,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
{
struct user_i387_struct fp;
- err = copy_from_user(&fp, sc.fpstate,
+ err = copy_from_user(&fp, (void *)sc.fpstate,
sizeof(struct user_i387_struct));
if (err)
return 1;
@@ -291,7 +291,7 @@ static int copy_sc_to_user(struct sigcontext __user *to,
#endif
#undef PUTREG
sc.oldmask = mask;
- sc.fpstate = to_fp;
+ sc.fpstate = (unsigned long)to_fp;
err = copy_to_user(to, &sc, sizeof(struct sigcontext));
if (err)
@@ -468,12 +468,10 @@ long sys_sigreturn(void)
struct sigframe __user *frame = (struct sigframe __user *)(sp - 8);
sigset_t set;
struct sigcontext __user *sc = &frame->sc;
- unsigned long __user *oldmask = &sc->oldmask;
- unsigned long __user *extramask = frame->extramask;
int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
- if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) ||
- copy_from_user(&set.sig[1], extramask, sig_size))
+ if (copy_from_user(&set.sig[0], (void *)sc->oldmask, sizeof(set.sig[0])) ||
+ copy_from_user(&set.sig[1], frame->extramask, sig_size))
goto segfault;
set_current_blocked(&set);
@@ -505,6 +503,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
{
struct rt_sigframe __user *frame;
int err = 0, sig = ksig->sig;
+ unsigned long fp_to;
frame = (struct rt_sigframe __user *)
round_down(stack_top - sizeof(struct rt_sigframe), 16);
@@ -526,7 +525,10 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs));
err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs,
set->sig[0]);
- err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate);
+
+ fp_to = (unsigned long)&frame->fpstate;
+
+ err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate);
if (sizeof(*set) == 16) {
err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S
index b972649d3a18..98816804e131 100644
--- a/arch/x86/um/stub_32.S
+++ b/arch/x86/um/stub_32.S
@@ -1,6 +1,5 @@
#include <as-layout.h>
- .globl syscall_stub
.section .__syscall_stub, "ax"
.globl batch_syscall_stub
diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S
index 7160b20172d0..ba914b3b8cc4 100644
--- a/arch/x86/um/stub_64.S
+++ b/arch/x86/um/stub_64.S
@@ -1,25 +1,9 @@
#include <as-layout.h>
- .globl syscall_stub
.section .__syscall_stub, "ax"
-syscall_stub:
- syscall
- /* We don't have 64-bit constants, so this constructs the address
- * we need.
- */
- movq $(STUB_DATA >> 32), %rbx
- salq $32, %rbx
- movq $(STUB_DATA & 0xffffffff), %rcx
- or %rcx, %rbx
- movq %rax, (%rbx)
- int3
-
.globl batch_syscall_stub
batch_syscall_stub:
- mov $(STUB_DATA >> 32), %rbx
- sal $32, %rbx
- mov $(STUB_DATA & 0xffffffff), %rax
- or %rax, %rbx
+ mov $(STUB_DATA), %rbx
/* load pointer to first operation */
mov %rbx, %rsp
add $0x10, %rsp
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2745e8ae93f3..4334e511cfc8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -75,6 +75,7 @@
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/pat.h>
+#include <asm/cpu.h>
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
@@ -1892,3 +1893,17 @@ const struct hypervisor_x86 x86_hyper_xen = {
.set_cpu_features = xen_set_cpu_features,
};
EXPORT_SYMBOL(x86_hyper_xen);
+
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num)
+{
+ arch_register_cpu(num);
+}
+EXPORT_SYMBOL(xen_arch_register_cpu);
+
+void xen_arch_unregister_cpu(int num)
+{
+ arch_unregister_cpu(num);
+}
+EXPORT_SYMBOL(xen_arch_unregister_cpu);
+#endif
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 1580e7a5a4cf..e079500b17f3 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void)
kfree(pages);
return -ENOMEM;
}
- rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
+ rc = alloc_xenballooned_pages(nr_grant_frames, pages);
if (rc) {
pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
nr_grant_frames, rc);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 41ee3e25fcce..c913ca4f6958 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2494,14 +2494,9 @@ void __init xen_init_mmu_ops(void)
{
x86_init.paging.pagetable_init = xen_pagetable_init;
- /* Optimization - we can use the HVM one but it has no idea which
- * VCPUs are descheduled - which means that it will needlessly IPI
- * them. Xen knows so let it do the job.
- */
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
- pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
return;
- }
+
pv_mmu_ops = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
@@ -2887,6 +2882,7 @@ static int do_remap_gfn(struct vm_area_struct *vma,
addr += range;
if (err_ptr)
err_ptr += batch;
+ cond_resched();
}
out:
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 660b3cfef234..cab9f766bb06 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -530,7 +530,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
* the new pages are installed with cmpxchg; if we lose the race then
* simply free the page we allocated and use the one that's there.
*/
-static bool alloc_p2m(unsigned long pfn)
+int xen_alloc_p2m_entry(unsigned long pfn)
{
unsigned topidx;
unsigned long *top_mfn_p, *mid_mfn;
@@ -540,6 +540,9 @@ static bool alloc_p2m(unsigned long pfn)
unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
unsigned long p2m_pfn;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
ptep = lookup_address(addr, &level);
BUG_ON(!ptep || level != PG_LEVEL_4K);
pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -548,7 +551,7 @@ static bool alloc_p2m(unsigned long pfn)
/* PMD level is missing, allocate a new one */
ptep = alloc_p2m_pmd(addr, pte_pg);
if (!ptep)
- return false;
+ return -ENOMEM;
}
if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
@@ -566,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn)
mid_mfn = alloc_p2m_page();
if (!mid_mfn)
- return false;
+ return -ENOMEM;
p2m_mid_mfn_init(mid_mfn, p2m_missing);
@@ -592,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn)
p2m = alloc_p2m_page();
if (!p2m)
- return false;
+ return -ENOMEM;
if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
p2m_init(p2m);
@@ -625,8 +628,9 @@ static bool alloc_p2m(unsigned long pfn)
HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
}
- return true;
+ return 0;
}
+EXPORT_SYMBOL(xen_alloc_p2m_entry);
unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
@@ -688,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!alloc_p2m(pfn))
+ int ret;
+
+ ret = xen_alloc_p2m_entry(pfn);
+ if (ret < 0)
return false;
return __set_phys_to_machine(pfn, mfn);
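A hedged sketch of a caller using the newly exported xen_alloc_p2m_entry(): pre-populate p2m pages for a pfn range before installing mappings, propagating -ENOMEM instead of the old boolean failure. The function name is made up for illustration.

/* hypothetical caller, not part of this patch */
static int prealloc_p2m_range(unsigned long pfn, unsigned long nr)
{
	unsigned long i;
	int ret;

	for (i = 0; i < nr; i++) {
		ret = xen_alloc_p2m_entry(pfn + i);
		if (ret < 0)
			return ret;
	}

	return 0;
}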
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 63320b6d35bc..7ab29518a3b9 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -212,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
e_pfn = PFN_DOWN(entry->addr + entry->size);
/* We only care about E820 after this */
- if (e_pfn < *min_pfn)
+ if (e_pfn <= *min_pfn)
continue;
s_pfn = PFN_UP(entry->addr);
@@ -829,6 +829,8 @@ char * __init xen_memory_setup(void)
addr = xen_e820_map[0].addr;
size = xen_e820_map[0].size;
while (i < xen_e820_map_entries) {
+ bool discard = false;
+
chunk_size = size;
type = xen_e820_map[i].type;
@@ -843,10 +845,11 @@ char * __init xen_memory_setup(void)
xen_add_extra_mem(pfn_s, n_pfns);
xen_max_p2m_pfn = pfn_s + n_pfns;
} else
- type = E820_UNUSABLE;
+ discard = true;
}
- xen_align_and_add_e820_region(addr, chunk_size, type);
+ if (!discard)
+ xen_align_and_add_e820_region(addr, chunk_size, type);
addr += chunk_size;
size -= chunk_size;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index feddabdab448..3705eabd7e22 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -68,26 +68,16 @@ static void xen_pv_post_suspend(int suspend_cancelled)
void xen_arch_pre_suspend(void)
{
- int cpu;
-
- for_each_online_cpu(cpu)
- xen_pmu_finish(cpu);
-
if (xen_pv_domain())
xen_pv_pre_suspend();
}
void xen_arch_post_suspend(int cancelled)
{
- int cpu;
-
if (xen_pv_domain())
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
-
- for_each_online_cpu(cpu)
- xen_pmu_init(cpu);
}
static void xen_vcpu_notify_restore(void *data)
@@ -106,10 +96,20 @@ static void xen_vcpu_notify_suspend(void *data)
void xen_arch_resume(void)
{
+ int cpu;
+
on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
+
+ for_each_online_cpu(cpu)
+ xen_pmu_init(cpu);
}
void xen_arch_suspend(void)
{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
+
on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
}