diff options
38 files changed, 1373 insertions, 282 deletions
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt new file mode 100644 index 000000000000..410fa673e5b6 --- /dev/null +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -0,0 +1,46 @@ +FLIC (floating interrupt controller) +==================================== + +FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some +machine check interruptions. All interrupts are stored in a per-vm list of +pending interrupts. FLIC performs operations on this list. + +Only one FLIC instance may be instantiated. + +FLIC provides support to +- add interrupts (KVM_DEV_FLIC_ENQUEUE) +- inspect currently pending interrupts (KVM_DEV_FLIC_GET_ALL_IRQS) +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) +- enable/disable for the guest transparent async page faults + +Groups: + KVM_DEV_FLIC_ENQUEUE + Passes a buffer and length into the kernel which are then injected into + the list of pending interrupts. + attr->addr contains the pointer to the buffer and attr->attr contains + the length of the buffer. + The format of the data structure kvm_s390_irq as it is copied from userspace + is defined in usr/include/linux/kvm.h. + + KVM_DEV_FLIC_GET_ALL_IRQS + Copies all floating interrupts into a buffer provided by userspace. + When the buffer is too small it returns -ENOMEM, which is the indication + for userspace to try again with a bigger buffer. + All interrupts remain pending, i.e. are not deleted from the list of + currently pending interrupts. + attr->addr contains the userspace address of the buffer into which all + interrupt data will be copied. + attr->attr contains the size of the buffer in bytes. + + KVM_DEV_FLIC_CLEAR_IRQS + Simply deletes all elements from the list of currently pending floating + interrupts. No interrupts are injected into the guest. + + KVM_DEV_FLIC_APF_ENABLE + Enables async page faults for the guest. 
So in case of a major page fault + the host is allowed to handle this async and continues the guest. + + KVM_DEV_FLIC_APF_DISABLE_WAIT + Disables async page faults for the guest and waits until already pending + async page faults are done. This is necessary to trigger a completion interrupt + for every init interrupt before migrating the interrupt list. diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index 4bbb5957ed1b..bd93ff6661b8 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -44,11 +44,21 @@ struct airq_iv { struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); void airq_iv_release(struct airq_iv *iv); -unsigned long airq_iv_alloc_bit(struct airq_iv *iv); -void airq_iv_free_bit(struct airq_iv *iv, unsigned long bit); +unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num); +void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num); unsigned long airq_iv_scan(struct airq_iv *iv, unsigned long start, unsigned long end); +static inline unsigned long airq_iv_alloc_bit(struct airq_iv *iv) +{ + return airq_iv_alloc(iv, 1); +} + +static inline void airq_iv_free_bit(struct airq_iv *iv, unsigned long bit) +{ + airq_iv_free(iv, bit, 1); +} + static inline unsigned long airq_iv_end(struct airq_iv *iv) { return iv->end; diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 5f8bcc5fe423..35f0faab5361 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -53,6 +53,7 @@ enum interruption_class { IRQIO_PCI, IRQIO_MSI, IRQIO_VIR, + IRQIO_VAI, NMI_NMI, CPU_RST, NR_ARCH_IRQS diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index eef3dd3fd9a9..734d302ba389 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -16,6 +16,7 @@ #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kvm_host.h> +#include <linux/kvm.h> #include <asm/debug.h> #include 
<asm/cpu.h> @@ -106,7 +107,9 @@ struct kvm_s390_sie_block { __u64 gbea; /* 0x0180 */ __u8 reserved188[24]; /* 0x0188 */ __u32 fac; /* 0x01a0 */ - __u8 reserved1a4[68]; /* 0x01a4 */ + __u8 reserved1a4[58]; /* 0x01a4 */ + __u64 pp; /* 0x01de */ + __u8 reserved1e6[2]; /* 0x01e6 */ __u64 itdba; /* 0x01e8 */ __u8 reserved1f0[16]; /* 0x01f0 */ } __attribute__((packed)); @@ -168,18 +171,6 @@ struct kvm_vcpu_stat { u32 diagnose_9c; }; -struct kvm_s390_io_info { - __u16 subchannel_id; /* 0x0b8 */ - __u16 subchannel_nr; /* 0x0ba */ - __u32 io_int_parm; /* 0x0bc */ - __u32 io_int_word; /* 0x0c0 */ -}; - -struct kvm_s390_ext_info { - __u32 ext_params; - __u64 ext_params2; -}; - #define PGM_OPERATION 0x01 #define PGM_PRIVILEGED_OP 0x02 #define PGM_EXECUTE 0x03 @@ -188,27 +179,6 @@ struct kvm_s390_ext_info { #define PGM_SPECIFICATION 0x06 #define PGM_DATA 0x07 -struct kvm_s390_pgm_info { - __u16 code; -}; - -struct kvm_s390_prefix_info { - __u32 address; -}; - -struct kvm_s390_extcall_info { - __u16 code; -}; - -struct kvm_s390_emerg_info { - __u16 code; -}; - -struct kvm_s390_mchk_info { - __u64 cr14; - __u64 mcic; -}; - struct kvm_s390_interrupt_info { struct list_head list; u64 type; @@ -245,7 +215,7 @@ struct kvm_s390_float_interrupt { int next_rr_cpu; unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1) / sizeof(long)]; - struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS]; + unsigned int irq_count; }; @@ -262,6 +232,10 @@ struct kvm_vcpu_arch { u64 stidp_data; }; struct gmap *gmap; +#define KVM_S390_PFAULT_TOKEN_INVALID (-1UL) + unsigned long pfault_token; + unsigned long pfault_select; + unsigned long pfault_compare; }; struct kvm_vm_stat { @@ -275,6 +249,7 @@ struct kvm_arch{ struct sca_block *sca; debug_info_t *dbf; struct kvm_s390_float_interrupt float_int; + struct kvm_device *flic; struct gmap *gmap; int css_support; }; @@ -287,6 +262,24 @@ static inline bool kvm_is_error_hva(unsigned long addr) return IS_ERR_VALUE(addr); } +#define ASYNC_PF_PER_VCPU 64 
+struct kvm_vcpu; +struct kvm_async_pf; +struct kvm_arch_async_pf { + unsigned long pfault_token; +}; + +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); + +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; #endif diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2204400d0bd5..66101f6c6d81 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -767,6 +767,7 @@ static inline void pgste_set_pte(pte_t *ptep, pte_t entry) * @table: pointer to the page directory * @asce: address space control element for gmap page table * @crst_list: list of all crst tables used in the guest address space + * @pfault_enabled: defines if pfaults are applicable for the guest */ struct gmap { struct list_head list; @@ -775,6 +776,7 @@ struct gmap { unsigned long asce; void *private; struct list_head crst_list; + bool pfault_enabled; }; /** diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 0a876bc543d3..dc5fc4f90e52 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -79,6 +79,7 @@ struct thread_struct { unsigned long ksp; /* kernel stack pointer */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. 
*/ + unsigned int gmap_pfault; /* signal of a pending guest pfault */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ unsigned long per_flags; /* Flags to control debug behavior */ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index d25da598ec62..2f0ade24f96a 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -16,6 +16,22 @@ #define __KVM_S390 +/* Device control API: s390-specific devices */ +#define KVM_DEV_FLIC_GET_ALL_IRQS 1 +#define KVM_DEV_FLIC_ENQUEUE 2 +#define KVM_DEV_FLIC_CLEAR_IRQS 3 +#define KVM_DEV_FLIC_APF_ENABLE 4 +#define KVM_DEV_FLIC_APF_DISABLE_WAIT 5 +/* + * We can have up to 4*64k pending subchannels + 8 adapter interrupts, + * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. + * There are also sclp and machine checks. This gives us + * sizeof(kvm_s390_irq)*(4*65536+8+64*64+1+1) = 72 * 266250 = 19170000 + * Lets round up to 8192 pages. 
+ */ +#define KVM_S390_MAX_FLOAT_IRQS 266250 +#define KVM_S390_FLIC_MAX_BUFFER 0x2000000 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ @@ -57,4 +73,9 @@ struct kvm_sync_regs { #define KVM_REG_S390_EPOCHDIFF (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2) #define KVM_REG_S390_CPU_TIMER (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x3) #define KVM_REG_S390_CLOCK_COMP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x4) +#define KVM_REG_S390_PFTOKEN (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x5) +#define KVM_REG_S390_PFCOMPARE (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x6) +#define KVM_REG_S390_PFSELECT (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x7) +#define KVM_REG_S390_PP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x8) +#define KVM_REG_S390_GBEA (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x9) #endif diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index bb27a262c44a..c288ef7e47b4 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -84,6 +84,7 @@ static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = { [IRQIO_PCI] = {.name = "PCI", .desc = "[I/O] PCI Interrupt" }, [IRQIO_MSI] = {.name = "MSI", .desc = "[I/O] MSI Interrupt" }, [IRQIO_VIR] = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, + [IRQIO_VAI] = {.name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, [NMI_NMI] = {.name = "NMI", .desc = "[NMI] Machine Check"}, [CPU_RST] = {.name = "RST", .desc = "[CPU] CPU Restart"}, }; diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 70b46eacf8e1..c8bacbcd2e5b 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -23,6 +23,8 @@ config KVM select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_EVENTFD + select KVM_ASYNC_PF + select KVM_ASYNC_PF_SYNC ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. 
This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 40b4c6470f88..a47d2c355f68 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -7,7 +7,7 @@ # as published by the Free Software Foundation. KVM := ../../../virt/kvm -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o ccflags-y := -Ivirt/kvm -Iarch/s390/kvm diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 8216c0e0b2e2..bf9ed34c2bcd 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -17,6 +17,7 @@ #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" +#include "gaccess.h" static int diag_release_pages(struct kvm_vcpu *vcpu) { @@ -46,6 +47,87 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) return 0; } +static int __diag_page_ref_service(struct kvm_vcpu *vcpu) +{ + struct prs_parm { + u16 code; + u16 subcode; + u16 parm_len; + u16 parm_version; + u64 token_addr; + u64 select_mask; + u64 compare_mask; + u64 zarch; + }; + struct prs_parm parm; + int rc; + u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4; + u16 ry = (vcpu->arch.sie_block->ipa & 0x0f); + unsigned long hva_token = KVM_HVA_ERR_BAD; + + if (vcpu->run->s.regs.gprs[rx] & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + if (copy_from_guest(vcpu, &parm, vcpu->run->s.regs.gprs[rx], sizeof(parm))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + switch (parm.subcode) { + case 0: /* TOKEN */ + if (vcpu->arch.pfault_token != KVM_S390_PFAULT_TOKEN_INVALID) { + /* + * If the pagefault handshake is already activated, + * the token must not be changed. We have to return + * decimal 8 instead, as mandated in SC24-6084. 
+ */ + vcpu->run->s.regs.gprs[ry] = 8; + return 0; + } + + if ((parm.compare_mask & parm.select_mask) != parm.compare_mask || + parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + hva_token = gfn_to_hva(vcpu->kvm, gpa_to_gfn(parm.token_addr)); + if (kvm_is_error_hva(hva_token)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + vcpu->arch.pfault_token = parm.token_addr; + vcpu->arch.pfault_select = parm.select_mask; + vcpu->arch.pfault_compare = parm.compare_mask; + vcpu->run->s.regs.gprs[ry] = 0; + rc = 0; + break; + case 1: /* + * CANCEL + * Specification allows to let already pending tokens survive + * the cancel, therefore to reduce code complexity, we assume + * all outstanding tokens are already pending. + */ + if (parm.token_addr || parm.select_mask || + parm.compare_mask || parm.zarch) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + vcpu->run->s.regs.gprs[ry] = 0; + /* + * If the pfault handling was not established or is already + * canceled SC24-6084 requests to return decimal 4. 
+ */ + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + vcpu->run->s.regs.gprs[ry] = 4; + else + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + + return rc; +} + static int __diag_time_slice_end(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); @@ -150,6 +232,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) return __diag_time_slice_end(vcpu); case 0x9c: return __diag_time_slice_end_directed(vcpu); + case 0x258: + return __diag_page_ref_service(vcpu); case 0x308: return __diag_ipl_functions(vcpu); case 0x500: diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5f79d2d79ca7..1d0f9d532c0b 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -31,7 +31,7 @@ static int is_ioint(u64 type) return ((type & 0xfffe0000u) != 0xfffe0000u); } -static int psw_extint_disabled(struct kvm_vcpu *vcpu) +int psw_extint_disabled(struct kvm_vcpu *vcpu) { return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT); } @@ -78,11 +78,8 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, return 1; return 0; case KVM_S390_INT_SERVICE: - if (psw_extint_disabled(vcpu)) - return 0; - if (vcpu->arch.sie_block->gcr[0] & 0x200ul) - return 1; - return 0; + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: case KVM_S390_INT_VIRTIO: if (psw_extint_disabled(vcpu)) return 0; @@ -150,6 +147,8 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu, case KVM_S390_INT_EXTERNAL_CALL: case KVM_S390_INT_EMERGENCY: case KVM_S390_INT_SERVICE: + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: case KVM_S390_INT_VIRTIO: if (psw_extint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_EXT_INT); @@ -223,6 +222,30 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, rc |= put_guest(vcpu, inti->ext.ext_params, (u32 __user *)__LC_EXT_PARAMS); break; + case KVM_S390_INT_PFAULT_INIT: + 
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0, + inti->ext.ext_params2); + rc = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE); + rc |= put_guest(vcpu, 0x0600, (u16 __user *) __LC_EXT_CPU_ADDR); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); + rc |= put_guest(vcpu, inti->ext.ext_params2, + (u64 __user *) __LC_EXT_PARAMS2); + break; + case KVM_S390_INT_PFAULT_DONE: + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0, + inti->ext.ext_params2); + rc = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE); + rc |= put_guest(vcpu, 0x0680, (u16 __user *) __LC_EXT_CPU_ADDR); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); + rc |= put_guest(vcpu, inti->ext.ext_params2, + (u64 __user *) __LC_EXT_PARAMS2); + break; case KVM_S390_INT_VIRTIO: VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", inti->ext.ext_params, inti->ext.ext_params2); @@ -357,7 +380,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) return 1; } -static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; @@ -482,6 +505,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) struct kvm_vcpu *vcpu; vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); + vcpu->preempted = true; tasklet_schedule(&vcpu->arch.tasklet); return HRTIMER_NORESTART; @@ -528,6 +552,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) list_for_each_entry_safe(inti, n, &fi->list, list) { if (__interrupt_is_deliverable(vcpu, inti)) { list_del(&inti->list); + fi->irq_count--; deliver 
= 1; break; } @@ -583,6 +608,7 @@ void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu) if ((inti->type == KVM_S390_MCHK) && __interrupt_is_deliverable(vcpu, inti)) { list_del(&inti->list); + fi->irq_count--; deliver = 1; break; } @@ -650,8 +676,10 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, inti = iter; break; } - if (inti) + if (inti) { list_del_init(&inti->list); + fi->irq_count--; + } if (list_empty(&fi->list)) atomic_set(&fi->active, 0); spin_unlock(&fi->lock); @@ -659,53 +687,101 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, return inti; } -int kvm_s390_inject_vm(struct kvm *kvm, - struct kvm_s390_interrupt *s390int) +static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { struct kvm_s390_local_interrupt *li; struct kvm_s390_float_interrupt *fi; - struct kvm_s390_interrupt_info *inti, *iter; + struct kvm_s390_interrupt_info *iter; + struct kvm_vcpu *dst_vcpu = NULL; int sigcpu; + int rc = 0; + + mutex_lock(&kvm->lock); + fi = &kvm->arch.float_int; + spin_lock(&fi->lock); + if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) { + rc = -EINVAL; + goto unlock_fi; + } + fi->irq_count++; + if (!is_ioint(inti->type)) { + list_add_tail(&inti->list, &fi->list); + } else { + u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word); + + /* Keep I/O interrupts sorted in isc order. 
*/ + list_for_each_entry(iter, &fi->list, list) { + if (!is_ioint(iter->type)) + continue; + if (int_word_to_isc_bits(iter->io.io_int_word) + <= isc_bits) + continue; + break; + } + list_add_tail(&inti->list, &iter->list); + } + atomic_set(&fi->active, 1); + sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); + if (sigcpu == KVM_MAX_VCPUS) { + do { + sigcpu = fi->next_rr_cpu++; + if (sigcpu == KVM_MAX_VCPUS) + sigcpu = fi->next_rr_cpu = 0; + } while (kvm_get_vcpu(kvm, sigcpu) == NULL); + } + dst_vcpu = kvm_get_vcpu(kvm, sigcpu); + li = &dst_vcpu->arch.local_int; + spin_lock_bh(&li->lock); + atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); + if (waitqueue_active(li->wq)) + wake_up_interruptible(li->wq); + kvm_get_vcpu(kvm, sigcpu)->preempted = true; + spin_unlock_bh(&li->lock); +unlock_fi: + spin_unlock(&fi->lock); + mutex_unlock(&kvm->lock); + return rc; +} + +int kvm_s390_inject_vm(struct kvm *kvm, + struct kvm_s390_interrupt *s390int) +{ + struct kvm_s390_interrupt_info *inti; inti = kzalloc(sizeof(*inti), GFP_KERNEL); if (!inti) return -ENOMEM; - switch (s390int->type) { + inti->type = s390int->type; + switch (inti->type) { case KVM_S390_INT_VIRTIO: VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx", s390int->parm, s390int->parm64); - inti->type = s390int->type; inti->ext.ext_params = s390int->parm; inti->ext.ext_params2 = s390int->parm64; break; case KVM_S390_INT_SERVICE: VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm); - inti->type = s390int->type; inti->ext.ext_params = s390int->parm; break; - case KVM_S390_PROGRAM_INT: - case KVM_S390_SIGP_STOP: - case KVM_S390_INT_EXTERNAL_CALL: - case KVM_S390_INT_EMERGENCY: - kfree(inti); - return -EINVAL; + case KVM_S390_INT_PFAULT_DONE: + inti->type = s390int->type; + inti->ext.ext_params2 = s390int->parm64; + break; case KVM_S390_MCHK: VM_EVENT(kvm, 5, "inject: machine check parm64:%llx", s390int->parm64); - inti->type = s390int->type; inti->mchk.cr14 = s390int->parm; /* upper bits are not used */ 
inti->mchk.mcic = s390int->parm64; break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (s390int->type & IOINT_AI_MASK) + if (inti->type & IOINT_AI_MASK) VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)"); else VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x", s390int->type & IOINT_CSSID_MASK, s390int->type & IOINT_SSID_MASK, s390int->type & IOINT_SCHID_MASK); - inti->type = s390int->type; inti->io.subchannel_id = s390int->parm >> 16; inti->io.subchannel_nr = s390int->parm & 0x0000ffffu; inti->io.io_int_parm = s390int->parm64 >> 32; @@ -718,43 +794,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64, 2); - mutex_lock(&kvm->lock); - fi = &kvm->arch.float_int; - spin_lock(&fi->lock); - if (!is_ioint(inti->type)) - list_add_tail(&inti->list, &fi->list); - else { - u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word); - - /* Keep I/O interrupts sorted in isc order. */ - list_for_each_entry(iter, &fi->list, list) { - if (!is_ioint(iter->type)) - continue; - if (int_word_to_isc_bits(iter->io.io_int_word) - <= isc_bits) - continue; - break; - } - list_add_tail(&inti->list, &iter->list); - } - atomic_set(&fi->active, 1); - sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); - if (sigcpu == KVM_MAX_VCPUS) { - do { - sigcpu = fi->next_rr_cpu++; - if (sigcpu == KVM_MAX_VCPUS) - sigcpu = fi->next_rr_cpu = 0; - } while (fi->local_int[sigcpu] == NULL); - } - li = fi->local_int[sigcpu]; - spin_lock_bh(&li->lock); - atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); - if (waitqueue_active(li->wq)) - wake_up_interruptible(li->wq); - spin_unlock_bh(&li->lock); - spin_unlock(&fi->lock); - mutex_unlock(&kvm->lock); - return 0; + return __inject_vm(kvm, inti); } int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, @@ -814,6 +854,10 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, inti->type = s390int->type; inti->mchk.mcic = s390int->parm64; break; + case KVM_S390_INT_PFAULT_INIT: + inti->type = s390int->type; + 
inti->ext.ext_params2 = s390int->parm64; + break; case KVM_S390_INT_VIRTIO: case KVM_S390_INT_SERVICE: case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: @@ -837,7 +881,237 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); + vcpu->preempted = true; spin_unlock_bh(&li->lock); mutex_unlock(&vcpu->kvm->lock); return 0; } + +static void clear_floating_interrupts(struct kvm *kvm) +{ + struct kvm_s390_float_interrupt *fi; + struct kvm_s390_interrupt_info *n, *inti = NULL; + + mutex_lock(&kvm->lock); + fi = &kvm->arch.float_int; + spin_lock(&fi->lock); + list_for_each_entry_safe(inti, n, &fi->list, list) { + list_del(&inti->list); + kfree(inti); + } + fi->irq_count = 0; + atomic_set(&fi->active, 0); + spin_unlock(&fi->lock); + mutex_unlock(&kvm->lock); +} + +static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti, + u8 *addr) +{ + struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr; + struct kvm_s390_irq irq = {0}; + + irq.type = inti->type; + switch (inti->type) { + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: + case KVM_S390_INT_VIRTIO: + case KVM_S390_INT_SERVICE: + irq.u.ext = inti->ext; + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + irq.u.io = inti->io; + break; + case KVM_S390_MCHK: + irq.u.mchk = inti->mchk; + break; + default: + return -EINVAL; + } + + if (copy_to_user(uptr, &irq, sizeof(irq))) + return -EFAULT; + + return 0; +} + +static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len) +{ + struct kvm_s390_interrupt_info *inti; + struct kvm_s390_float_interrupt *fi; + int ret = 0; + int n = 0; + + mutex_lock(&kvm->lock); + fi = &kvm->arch.float_int; + spin_lock(&fi->lock); + + list_for_each_entry(inti, &fi->list, list) { + if (len < sizeof(struct kvm_s390_irq)) { + /* signal userspace to try again */ + ret = -ENOMEM; + break; + } + ret = copy_irq_to_user(inti, buf); + 
if (ret) + break; + buf += sizeof(struct kvm_s390_irq); + len -= sizeof(struct kvm_s390_irq); + n++; + } + + spin_unlock(&fi->lock); + mutex_unlock(&kvm->lock); + + return ret < 0 ? ret : n; +} + +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int r; + + switch (attr->group) { + case KVM_DEV_FLIC_GET_ALL_IRQS: + r = get_all_floating_irqs(dev->kvm, (u8 *) attr->addr, + attr->attr); + break; + default: + r = -EINVAL; + } + + return r; +} + +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti, + u64 addr) +{ + struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr; + void *target = NULL; + void __user *source; + u64 size; + + if (get_user(inti->type, (u64 __user *)addr)) + return -EFAULT; + + switch (inti->type) { + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: + case KVM_S390_INT_VIRTIO: + case KVM_S390_INT_SERVICE: + target = (void *) &inti->ext; + source = &uptr->u.ext; + size = sizeof(inti->ext); + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + target = (void *) &inti->io; + source = &uptr->u.io; + size = sizeof(inti->io); + break; + case KVM_S390_MCHK: + target = (void *) &inti->mchk; + source = &uptr->u.mchk; + size = sizeof(inti->mchk); + break; + default: + return -EINVAL; + } + + if (copy_from_user(target, source, size)) + return -EFAULT; + + return 0; +} + +static int enqueue_floating_irq(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvm_s390_interrupt_info *inti = NULL; + int r = 0; + int len = attr->attr; + + if (len % sizeof(struct kvm_s390_irq) != 0) + return -EINVAL; + else if (len > KVM_S390_FLIC_MAX_BUFFER) + return -EINVAL; + + while (len >= sizeof(struct kvm_s390_irq)) { + inti = kzalloc(sizeof(*inti), GFP_KERNEL); + if (!inti) + return -ENOMEM; + + r = copy_irq_from_user(inti, attr->addr); + if (r) { + kfree(inti); + return r; + } + r = __inject_vm(dev->kvm, inti); + if (r) { + kfree(inti); + return r; + } + len -= 
sizeof(struct kvm_s390_irq); + attr->addr += sizeof(struct kvm_s390_irq); + } + + return r; +} + +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int r = 0; + unsigned int i; + struct kvm_vcpu *vcpu; + + switch (attr->group) { + case KVM_DEV_FLIC_ENQUEUE: + r = enqueue_floating_irq(dev, attr); + break; + case KVM_DEV_FLIC_CLEAR_IRQS: + r = 0; + clear_floating_interrupts(dev->kvm); + break; + case KVM_DEV_FLIC_APF_ENABLE: + dev->kvm->arch.gmap->pfault_enabled = 1; + break; + case KVM_DEV_FLIC_APF_DISABLE_WAIT: + dev->kvm->arch.gmap->pfault_enabled = 0; + /* + * Make sure no async faults are in transition when + * clearing the queues. So we don't need to worry + * about late coming workers. + */ + synchronize_srcu(&dev->kvm->srcu); + kvm_for_each_vcpu(i, vcpu, dev->kvm) + kvm_clear_async_pf_completion_queue(vcpu); + break; + default: + r = -EINVAL; + } + + return r; +} + +static int flic_create(struct kvm_device *dev, u32 type) +{ + if (!dev) + return -EINVAL; + if (dev->kvm->arch.flic) + return -EINVAL; + dev->kvm->arch.flic = dev; + return 0; +} + +static void flic_destroy(struct kvm_device *dev) +{ + dev->kvm->arch.flic = NULL; + kfree(dev); +} + +/* s390 floating irq controller (flic) */ +struct kvm_device_ops kvm_flic_ops = { + .name = "kvm-flic", + .get_attr = flic_get_attr, + .set_attr = flic_set_attr, + .create = flic_create, + .destroy = flic_destroy, +}; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e0676f390d57..9136f8d40850 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -152,11 +152,13 @@ int kvm_dev_ioctl_check_extension(long ext) #ifdef CONFIG_KVM_S390_UCONTROL case KVM_CAP_S390_UCONTROL: #endif + case KVM_CAP_ASYNC_PF: case KVM_CAP_SYNC_REGS: case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: r = 1; break; case KVM_CAP_NR_VCPUS: @@ -254,6 +256,7 @@ int kvm_arch_init_vm(struct kvm *kvm, 
unsigned long type) if (!kvm->arch.gmap) goto out_nogmap; kvm->arch.gmap->private = kvm; + kvm->arch.gmap->pfault_enabled = 0; } kvm->arch.css_support = 0; @@ -271,6 +274,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); + kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_is_ucontrol(vcpu->kvm)) { clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn); @@ -320,6 +324,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) /* Section: vcpu related */ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); if (kvm_is_ucontrol(vcpu->kvm)) { vcpu->arch.gmap = gmap_alloc(current->mm); if (!vcpu->arch.gmap) @@ -380,6 +386,9 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.guest_fpregs.fpc = 0; asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); } @@ -451,11 +460,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, spin_lock_init(&vcpu->arch.local_int.lock); INIT_LIST_HEAD(&vcpu->arch.local_int.list); vcpu->arch.local_int.float_int = &kvm->arch.float_int; - spin_lock(&kvm->arch.float_int.lock); - kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int; vcpu->arch.local_int.wq = &vcpu->wq; vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; - spin_unlock(&kvm->arch.float_int.lock); rc = kvm_vcpu_init(vcpu, kvm, id); if (rc) @@ -475,9 +481,7 @@ out: int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - /* kvm common code refers to this, but never calls it */ - BUG(); - return 0; + return kvm_cpu_has_interrupt(vcpu); } void s390_vcpu_block(struct kvm_vcpu *vcpu) @@ -553,6 +557,26 @@ static 
int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, r = put_user(vcpu->arch.sie_block->ckc, (u64 __user *)reg->addr); break; + case KVM_REG_S390_PFTOKEN: + r = put_user(vcpu->arch.pfault_token, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFCOMPARE: + r = put_user(vcpu->arch.pfault_compare, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFSELECT: + r = put_user(vcpu->arch.pfault_select, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PP: + r = put_user(vcpu->arch.sie_block->pp, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_GBEA: + r = put_user(vcpu->arch.sie_block->gbea, + (u64 __user *)reg->addr); + break; default: break; } @@ -582,6 +606,26 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, r = get_user(vcpu->arch.sie_block->ckc, (u64 __user *)reg->addr); break; + case KVM_REG_S390_PFTOKEN: + r = get_user(vcpu->arch.pfault_token, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFCOMPARE: + r = get_user(vcpu->arch.pfault_compare, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFSELECT: + r = get_user(vcpu->arch.pfault_select, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PP: + r = get_user(vcpu->arch.sie_block->pp, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_GBEA: + r = get_user(vcpu->arch.sie_block->gbea, + (u64 __user *)reg->addr); + break; default: break; } @@ -700,10 +744,100 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) return 0; } +static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu) +{ + long rc; + hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap); + struct mm_struct *mm = current->mm; + down_read(&mm->mmap_sem); + rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL); + up_read(&mm->mmap_sem); + return rc; +} + +static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, + unsigned long token) +{ + struct kvm_s390_interrupt inti; + inti.parm64 = token; + + if (start_token) { + 
inti.type = KVM_S390_INT_PFAULT_INIT; + WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &inti)); + } else { + inti.type = KVM_S390_INT_PFAULT_DONE; + WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti)); + } +} + +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token); + __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token); +} + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + trace_kvm_s390_pfault_done(vcpu, work->arch.pfault_token); + __kvm_inject_pfault_token(vcpu, false, work->arch.pfault_token); +} + +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + /* s390 will always inject the page directly */ +} + +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +{ + /* + * s390 will always inject the page directly, + * but we still want check_async_completion to cleanup + */ + return true; +} + +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +{ + hva_t hva; + struct kvm_arch_async_pf arch; + int rc; + + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + return 0; + if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) != + vcpu->arch.pfault_compare) + return 0; + if (psw_extint_disabled(vcpu)) + return 0; + if (kvm_cpu_has_interrupt(vcpu)) + return 0; + if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul)) + return 0; + if (!vcpu->arch.gmap->pfault_enabled) + return 0; + + hva = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap); + if (copy_from_guest(vcpu, &arch.pfault_token, vcpu->arch.pfault_token, 8)) + return 0; + + rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); + return rc; +} + static int vcpu_pre_run(struct kvm_vcpu *vcpu) { int rc, cpuflags; + /* + * On s390 notifications for arriving pages will be delivered directly + * to the guest but the house keeping for completed pfaults is + * handled outside the worker. 
+ */ + kvm_check_async_pf_completion(vcpu); + memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16); if (need_resched()) @@ -729,7 +863,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) { - int rc; + int rc = -1; VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); @@ -743,7 +877,16 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) current->thread.gmap_addr; vcpu->run->s390_ucontrol.pgm_code = 0x10; rc = -EREMOTE; - } else { + + } else if (current->thread.gmap_pfault) { + trace_kvm_s390_major_guest_pfault(vcpu); + current->thread.gmap_pfault = 0; + if (kvm_arch_setup_async_pf(vcpu) || + (kvm_arch_fault_in_sync(vcpu) >= 0)) + rc = 0; + } + + if (rc == -1) { VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); trace_kvm_s390_sie_fault(vcpu); rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); @@ -806,7 +949,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); - BUG_ON(vcpu->kvm->arch.float_int.local_int[vcpu->vcpu_id] == NULL); + BUG_ON(kvm_get_vcpu(vcpu->kvm, vcpu->vcpu_id) == NULL); switch (kvm_run->exit_reason) { case KVM_EXIT_S390_SIEIC: diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index f9559b0bd620..ed4750a5bc3c 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -159,4 +159,8 @@ void exit_sie_sync(struct kvm_vcpu *vcpu); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); +/* implemented in interrupt.c */ +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int psw_extint_disabled(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 75beea632a10..ae9e8ee21557 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -396,15 +396,10 @@ static int handle_stidp(struct kvm_vcpu *vcpu) static void handle_stsi_3_2_2(struct kvm_vcpu 
*vcpu, struct sysinfo_3_2_2 *mem) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; int cpus = 0; int n; - spin_lock(&fi->lock); - for (n = 0; n < KVM_MAX_VCPUS; n++) - if (fi->local_int[n]) - cpus++; - spin_unlock(&fi->lock); + cpus = atomic_read(&vcpu->kvm->online_vcpus); /* deal with other level 3 hypervisors */ if (stsi(mem, 3, 2, 2)) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 87c2b3a3bd3e..3fe44c441609 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -23,29 +23,30 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, u64 *reg) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu = NULL; + int cpuflags; int rc; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; - spin_lock(&fi->lock); - if (fi->local_int[cpu_addr] == NULL) - rc = SIGP_CC_NOT_OPERATIONAL; - else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags) - & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; + + cpuflags = atomic_read(li->cpuflags); + if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) rc = SIGP_CC_ORDER_CODE_ACCEPTED; else { *reg &= 0xffffffff00000000UL; - if (atomic_read(fi->local_int[cpu_addr]->cpuflags) - & CPUSTAT_ECALL_PEND) + if (cpuflags & CPUSTAT_ECALL_PEND) *reg |= SIGP_STATUS_EXT_CALL_PENDING; - if (atomic_read(fi->local_int[cpu_addr]->cpuflags) - & CPUSTAT_STOPPED) + if (cpuflags & CPUSTAT_STOPPED) *reg |= SIGP_STATUS_STOPPED; rc = SIGP_CC_STATUS_STORED; } - spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc); return rc; @@ -53,10 +54,9 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct 
kvm_s390_local_interrupt *li; struct kvm_s390_interrupt_info *inti; - int rc; + struct kvm_vcpu *dst_vcpu = NULL; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; @@ -68,13 +68,10 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) inti->type = KVM_S390_INT_EMERGENCY; inti->emerg.code = vcpu->vcpu_id; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = SIGP_CC_NOT_OPERATIONAL; - kfree(inti); - goto unlock; - } + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; spin_lock_bh(&li->lock); list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); @@ -82,11 +79,9 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) if (waitqueue_active(li->wq)) wake_up_interruptible(li->wq); spin_unlock_bh(&li->lock); - rc = SIGP_CC_ORDER_CODE_ACCEPTED; VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); -unlock: - spin_unlock(&fi->lock); - return rc; + + return SIGP_CC_ORDER_CODE_ACCEPTED; } static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr, @@ -122,10 +117,9 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr, static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li; struct kvm_s390_interrupt_info *inti; - int rc; + struct kvm_vcpu *dst_vcpu = NULL; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; @@ -137,13 +131,10 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) inti->type = KVM_S390_INT_EXTERNAL_CALL; inti->extcall.code = vcpu->vcpu_id; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = SIGP_CC_NOT_OPERATIONAL; - kfree(inti); - goto unlock; - } + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; 
spin_lock_bh(&li->lock); list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); @@ -151,11 +142,9 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) if (waitqueue_active(li->wq)) wake_up_interruptible(li->wq); spin_unlock_bh(&li->lock); - rc = SIGP_CC_ORDER_CODE_ACCEPTED; VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); -unlock: - spin_unlock(&fi->lock); - return rc; + + return SIGP_CC_ORDER_CODE_ACCEPTED; } static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) @@ -189,31 +178,26 @@ out: static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu = NULL; int rc; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = SIGP_CC_NOT_OPERATIONAL; - goto unlock; - } + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; rc = __inject_sigp_stop(li, action); -unlock: - spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr); if ((action & ACTION_STORE_ON_STOP) != 0 && rc == -ESHUTDOWN) { /* If the CPU has already been stopped, we still have * to save the status when doing stop-and-store. This * has to be done after unlocking all spinlocks. 
*/ - struct kvm_vcpu *dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); rc = kvm_s390_store_status_unloaded(dst_vcpu, KVM_S390_STORE_STATUS_NOADDR); } @@ -224,6 +208,8 @@ unlock: static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) { int rc; + unsigned int i; + struct kvm_vcpu *v; switch (parameter & 0xff) { case 0: @@ -231,6 +217,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) break; case 1: case 2: + kvm_for_each_vcpu(i, v, vcpu->kvm) { + v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(v); + } + rc = SIGP_CC_ORDER_CODE_ACCEPTED; break; default: @@ -242,12 +233,18 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, u64 *reg) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; - struct kvm_s390_local_interrupt *li = NULL; + struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu = NULL; struct kvm_s390_interrupt_info *inti; int rc; u8 tmp; + if (cpu_addr < KVM_MAX_VCPUS) + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; + /* make sure that the new value is valid memory */ address = address & 0x7fffe000u; if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || @@ -261,18 +258,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, if (!inti) return SIGP_CC_BUSY; - spin_lock(&fi->lock); - if (cpu_addr < KVM_MAX_VCPUS) - li = fi->local_int[cpu_addr]; - - if (li == NULL) { - *reg &= 0xffffffff00000000UL; - *reg |= SIGP_STATUS_INCORRECT_STATE; - rc = SIGP_CC_STATUS_STORED; - kfree(inti); - goto out_fi; - } - spin_lock_bh(&li->lock); /* cpu must be in stopped state */ if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { @@ -295,8 +280,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", 
cpu_addr, address); out_li: spin_unlock_bh(&li->lock); -out_fi: - spin_unlock(&fi->lock); return rc; } @@ -334,28 +317,26 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, u16 cpu_id, static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, u64 *reg) { + struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu = NULL; int rc; - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; - spin_lock(&fi->lock); - if (fi->local_int[cpu_addr] == NULL) - rc = SIGP_CC_NOT_OPERATIONAL; - else { - if (atomic_read(fi->local_int[cpu_addr]->cpuflags) - & CPUSTAT_RUNNING) { - /* running */ - rc = SIGP_CC_ORDER_CODE_ACCEPTED; - } else { - /* not running */ - *reg &= 0xffffffff00000000UL; - *reg |= SIGP_STATUS_NOT_RUNNING; - rc = SIGP_CC_STATUS_STORED; - } + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; + if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) { + /* running */ + rc = SIGP_CC_ORDER_CODE_ACCEPTED; + } else { + /* not running */ + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_NOT_RUNNING; + rc = SIGP_CC_STATUS_STORED; } - spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sensed running status of cpu %x rc %x", cpu_addr, rc); @@ -366,26 +347,22 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, /* Test whether the destination CPU is available and not busy */ static int sigp_check_callable(struct kvm_vcpu *vcpu, u16 cpu_addr) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li; int rc = SIGP_CC_ORDER_CODE_ACCEPTED; + struct kvm_vcpu *dst_vcpu = NULL; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = SIGP_CC_NOT_OPERATIONAL; - goto out; - } - + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return 
SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; spin_lock_bh(&li->lock); if (li->action_bits & ACTION_STOP_ON_STOP) rc = SIGP_CC_BUSY; spin_unlock_bh(&li->lock); -out: - spin_unlock(&fi->lock); + return rc; } diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index 3db76b2daed7..e8e7213d4cc5 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -30,6 +30,52 @@ TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \ __entry->pswmask, __entry->pswaddr, p_args) +TRACE_EVENT(kvm_s390_major_guest_pfault, + TP_PROTO(VCPU_PROTO_COMMON), + TP_ARGS(VCPU_ARGS_COMMON), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + ), + VCPU_TP_PRINTK("%s", "major fault, maybe applicable for pfault") + ); + +TRACE_EVENT(kvm_s390_pfault_init, + TP_PROTO(VCPU_PROTO_COMMON, long pfault_token), + TP_ARGS(VCPU_ARGS_COMMON, pfault_token), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(long, pfault_token) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->pfault_token = pfault_token; + ), + VCPU_TP_PRINTK("init pfault token %ld", __entry->pfault_token) + ); + +TRACE_EVENT(kvm_s390_pfault_done, + TP_PROTO(VCPU_PROTO_COMMON, long pfault_token), + TP_ARGS(VCPU_ARGS_COMMON, pfault_token), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(long, pfault_token) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->pfault_token = pfault_token; + ), + VCPU_TP_PRINTK("done pfault token %ld", __entry->pfault_token) + ); + /* * Tracepoints for SIE entry and exit. 
*/ diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d95265b2719f..88cef505453b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -50,6 +50,7 @@ #define VM_FAULT_BADMAP 0x020000 #define VM_FAULT_BADACCESS 0x040000 #define VM_FAULT_SIGNAL 0x080000 +#define VM_FAULT_PFAULT 0x100000 static unsigned long store_indication __read_mostly; @@ -227,6 +228,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) return; } case VM_FAULT_BADCONTEXT: + case VM_FAULT_PFAULT: do_no_context(regs); break; case VM_FAULT_SIGNAL: @@ -264,6 +266,9 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) */ static inline int do_exception(struct pt_regs *regs, int access) { +#ifdef CONFIG_PGSTE + struct gmap *gmap; +#endif struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; @@ -304,9 +309,10 @@ static inline int do_exception(struct pt_regs *regs, int access) down_read(&mm->mmap_sem); #ifdef CONFIG_PGSTE - if ((current->flags & PF_VCPU) && S390_lowcore.gmap) { - address = __gmap_fault(address, - (struct gmap *) S390_lowcore.gmap); + gmap = (struct gmap *) + ((current->flags & PF_VCPU) ? S390_lowcore.gmap : 0); + if (gmap) { + address = __gmap_fault(address, gmap); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; goto out_up; @@ -315,6 +321,8 @@ static inline int do_exception(struct pt_regs *regs, int access) fault = VM_FAULT_OOM; goto out_up; } + if (gmap->pfault_enabled) + flags |= FAULT_FLAG_RETRY_NOWAIT; } #endif @@ -371,9 +379,19 @@ retry: regs, address); } if (fault & VM_FAULT_RETRY) { +#ifdef CONFIG_PGSTE + if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* FAULT_FLAG_RETRY_NOWAIT has been set, + * mmap_sem has not been released */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; + } +#endif /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. 
*/ - flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags &= ~(FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT); flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fdf83afbb7d9..85be627ef5de 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -444,7 +444,6 @@ struct kvm_vcpu_arch { } st; u64 last_guest_tsc; - u64 last_kernel_ns; u64 last_host_tsc; u64 tsc_offset_adjustment; u64 this_tsc_nsec; @@ -599,6 +598,8 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; cycle_t master_cycle_now; + struct delayed_work kvmclock_update_work; + struct delayed_work kvmclock_sync_work; struct kvm_xen_hvm_config xen_hvm_config; @@ -765,6 +766,7 @@ struct kvm_x86_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); void (*handle_external_intr)(struct kvm_vcpu *vcpu); + bool (*mpx_supported)(void); }; struct kvm_arch_async_pf { diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2067264fb7f5..7004d21e6219 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -85,6 +85,7 @@ #define VM_EXIT_SAVE_IA32_EFER 0x00100000 #define VM_EXIT_LOAD_IA32_EFER 0x00200000 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 +#define VM_EXIT_CLEAR_BNDCFGS 0x00800000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -95,6 +96,7 @@ #define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 +#define VM_ENTRY_LOAD_BNDCFGS 0x00010000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff @@ -174,6 +176,8 @@ enum vmcs_field { GUEST_PDPTR2_HIGH = 0x0000280f, GUEST_PDPTR3 = 0x00002810, GUEST_PDPTR3_HIGH = 0x00002811, + GUEST_BNDCFGS = 0x00002812, + GUEST_BNDCFGS_HIGH = 0x00002813, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, HOST_IA32_EFER = 0x00002c02, diff --git a/arch/x86/include/asm/xsave.h 
b/arch/x86/include/asm/xsave.h index 554738963b28..dcd047b629ec 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,6 +13,8 @@ #define XSTATE_BNDCSR 0x10 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) #define FXSAVE_SIZE 512 diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c19fc60ff062..ed821ed45eb6 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -295,6 +295,7 @@ #define MSR_SMI_COUNT 0x00000034 #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define MSR_IA32_TSC_ADJUST 0x0000003b +#define MSR_IA32_BNDCFGS 0x00000d90 #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 713f1b3bad52..0331cb389d68 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -417,7 +417,6 @@ void kvm_disable_steal_time(void) #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { - WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e6041094ff26..d9156ceecdff 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -242,7 +242,7 @@ void __init kvmclock_init(void) hv_clock = __va(mem); memset(hv_clock, 0, size); - if (kvm_register_clock("boot clock")) { + if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; memblock_free(mem, size); return; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c6976257eff5..ddc8a7e165df 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv) int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - xstate_bv &= ~XSTATE_FPSSE; + 
xstate_bv &= XSTATE_EXTEND_MASK; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx; @@ -74,8 +74,8 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & host_xcr0 & KVM_SUPPORTED_XCR0; - vcpu->arch.guest_xstate_size = - xstate_required_size(vcpu->arch.guest_supported_xcr0); + vcpu->arch.guest_xstate_size = best->ebx = + xstate_required_size(vcpu->arch.xcr0); } kvm_pmu_cpuid_update(vcpu); @@ -256,6 +256,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported ? + (kvm_x86_ops->mpx_supported() ? F(MPX) : 0) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -303,7 +305,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | + F(ADX); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b531351a587..f5704d9e5ddc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3329,7 +3329,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cba218a2f08d..b1e6c1bf68d3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -913,7 +913,8 @@ 
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't * used by guest then tlbs are not flushed, so guest is allowed to access the * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. + * We set tlbs_dirty to let the notifier know this change and delay the flush + * until such a case actually happens. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -942,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } @@ -957,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e81df8fce027..64d9bb9590e3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2842,6 +2842,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 392752834751..53c324f3cc5e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -441,6 +441,7 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -1710,6 +1711,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) 
kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1747,6 +1750,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); /* * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. @@ -2479,6 +2484,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + data = vmcs_read64(GUEST_BNDCFGS); + break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2547,6 +2555,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2837,7 +2848,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2854,7 +2865,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -7052,6 +7063,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & 
VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8634,6 +8651,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, }; static int __init vmx_init(void) @@ -8721,6 +8739,8 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b8578432d5b..a45bcac45645 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -595,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - u64 xcr0; + u64 xcr0 = xcr; + u64 old_xcr0 = vcpu->arch.xcr0; u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - xcr0 = xcr; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) @@ -616,8 +616,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if (xcr0 & ~valid_bits) return 1; + if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; + + if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + kvm_update_cpuid(vcpu); return 0; } @@ -879,7 +885,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS }; static unsigned num_msrs_to_save; @@ -1581,7 +1587,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all 
the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; /* @@ -1623,14 +1628,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * the others. * * So in those cases, request a kvmclock update for all vcpus. - * The worst case for a remote vcpu to update its kvmclock - * is then bounded by maximum nohz sleep latency. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates. */ -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) + +static void kvmclock_update_fn(struct work_struct *work) { int i; - struct kvm *kvm = v->kvm; + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_update_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -1639,6 +1651,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) } } +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + struct kvm *kvm = v->kvm; + + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + schedule_delayed_work(&kvm->arch.kvmclock_update_work, + KVMCLOCK_UPDATE_DELAY); +} + +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_sync_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); +} + static bool msr_mtrr_valid(unsigned msr) { 
switch (msr) { @@ -2323,9 +2358,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_VP_INDEX: { int r; struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { data = r; + break; + } + } break; } case HV_X64_MSR_EOI: @@ -4394,6 +4432,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -6711,6 +6750,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { int r; struct msr_data msr; + struct kvm *kvm = vcpu->kvm; r = vcpu_load(vcpu); if (r) @@ -6721,6 +6761,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); + return r; } @@ -7013,6 +7056,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + return 0; } @@ -7050,6 +7096,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8da5823bcde6..392ecbff0030 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -122,7 +122,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); -#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_BNDREGS | XSTATE_BNDCSR) extern 
u64 host_xcr0; extern unsigned int min_timer_period_us; diff --git a/drivers/s390/cio/airq.c b/drivers/s390/cio/airq.c index f055df0b167f..445564c790f6 100644 --- a/drivers/s390/cio/airq.c +++ b/drivers/s390/cio/airq.c @@ -186,55 +186,71 @@ void airq_iv_release(struct airq_iv *iv) EXPORT_SYMBOL(airq_iv_release); /** - * airq_iv_alloc_bit - allocate an irq bit from an interrupt vector + * airq_iv_alloc - allocate irq bits from an interrupt vector * @iv: pointer to an interrupt vector structure + * @num: number of consecutive irq bits to allocate * - * Returns the bit number of the allocated irq, or -1UL if no bit - * is available or the AIRQ_IV_ALLOC flag has not been specified + * Returns the bit number of the first irq in the allocated block of irqs, + * or -1UL if no bit is available or the AIRQ_IV_ALLOC flag has not been + * specified */ -unsigned long airq_iv_alloc_bit(struct airq_iv *iv) +unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num) { - unsigned long bit; + unsigned long bit, i; - if (!iv->avail) + if (!iv->avail || num == 0) return -1UL; spin_lock(&iv->lock); bit = find_first_bit_inv(iv->avail, iv->bits); - if (bit < iv->bits) { - clear_bit_inv(bit, iv->avail); - if (bit >= iv->end) - iv->end = bit + 1; - } else + while (bit + num <= iv->bits) { + for (i = 1; i < num; i++) + if (!test_bit_inv(bit + i, iv->avail)) + break; + if (i >= num) { + /* Found a suitable block of irqs */ + for (i = 0; i < num; i++) + clear_bit_inv(bit + i, iv->avail); + if (bit + num >= iv->end) + iv->end = bit + num + 1; + break; + } + bit = find_next_bit_inv(iv->avail, iv->bits, bit + i + 1); + } + if (bit + num > iv->bits) bit = -1UL; spin_unlock(&iv->lock); return bit; } -EXPORT_SYMBOL(airq_iv_alloc_bit); +EXPORT_SYMBOL(airq_iv_alloc); /** - * airq_iv_free_bit - free an irq bit of an interrupt vector + * airq_iv_free - free irq bits of an interrupt vector * @iv: pointer to interrupt vector structure - * @bit: number of the irq bit to free + * @bit: number of 
the first irq bit to free + * @num: number of consecutive irq bits to free */ -void airq_iv_free_bit(struct airq_iv *iv, unsigned long bit) +void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num) { - if (!iv->avail) + unsigned long i; + + if (!iv->avail || num == 0) return; spin_lock(&iv->lock); - /* Clear (possibly left over) interrupt bit */ - clear_bit_inv(bit, iv->vector); - /* Make the bit position available again */ - set_bit_inv(bit, iv->avail); - if (bit == iv->end - 1) { + for (i = 0; i < num; i++) { + /* Clear (possibly left over) interrupt bit */ + clear_bit_inv(bit + i, iv->vector); + /* Make the bit positions available again */ + set_bit_inv(bit + i, iv->avail); + } + if (bit + num >= iv->end) { /* Find new end of bit-field */ - while (--iv->end > 0) - if (!test_bit_inv(iv->end - 1, iv->avail)) - break; + while (iv->end > 0 && !test_bit_inv(iv->end - 1, iv->avail)) + iv->end--; } spin_unlock(&iv->lock); } -EXPORT_SYMBOL(airq_iv_free_bit); +EXPORT_SYMBOL(airq_iv_free); /** * airq_iv_scan - scan interrupt vector for non-zero bits diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 0fc584832001..6a2b5fdcd552 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -1,7 +1,7 @@ /* * ccw based virtio transport * - * Copyright IBM Corp. 2012 + * Copyright IBM Corp. 
2012, 2014 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -32,6 +32,8 @@ #include <asm/cio.h> #include <asm/ccwdev.h> #include <asm/virtio-ccw.h> +#include <asm/isc.h> +#include <asm/airq.h> /* * virtio related functions @@ -58,6 +60,8 @@ struct virtio_ccw_device { unsigned long indicators; unsigned long indicators2; struct vq_config_block *config_block; + bool is_thinint; + void *airq_info; }; struct vq_info_block { @@ -72,15 +76,38 @@ struct virtio_feature_desc { __u8 index; } __packed; +struct virtio_thinint_area { + unsigned long summary_indicator; + unsigned long indicator; + u64 bit_nr; + u8 isc; +} __packed; + struct virtio_ccw_vq_info { struct virtqueue *vq; int num; void *queue; struct vq_info_block *info_block; + int bit_nr; struct list_head node; long cookie; }; +#define VIRTIO_AIRQ_ISC IO_SCH_ISC /* inherit from subchannel */ + +#define VIRTIO_IV_BITS (L1_CACHE_BYTES * 8) +#define MAX_AIRQ_AREAS 20 + +static int virtio_ccw_use_airq = 1; + +struct airq_info { + rwlock_t lock; + u8 summary_indicator; + struct airq_struct airq; + struct airq_iv *aiv; +}; +static struct airq_info *airq_areas[MAX_AIRQ_AREAS]; + #define CCW_CMD_SET_VQ 0x13 #define CCW_CMD_VDEV_RESET 0x33 #define CCW_CMD_SET_IND 0x43 @@ -91,6 +118,7 @@ struct virtio_ccw_vq_info { #define CCW_CMD_WRITE_CONF 0x21 #define CCW_CMD_WRITE_STATUS 0x31 #define CCW_CMD_READ_VQ_CONF 0x32 +#define CCW_CMD_SET_IND_ADAPTER 0x73 #define VIRTIO_CCW_DOING_SET_VQ 0x00010000 #define VIRTIO_CCW_DOING_RESET 0x00040000 @@ -102,6 +130,7 @@ struct virtio_ccw_vq_info { #define VIRTIO_CCW_DOING_SET_IND 0x01000000 #define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000 #define VIRTIO_CCW_DOING_SET_CONF_IND 0x04000000 +#define VIRTIO_CCW_DOING_SET_IND_ADAPTER 0x08000000 #define VIRTIO_CCW_INTPARM_MASK 0xffff0000 static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev) @@ -109,6 +138,125 @@ static struct 
virtio_ccw_device *to_vc_device(struct virtio_device *vdev) return container_of(vdev, struct virtio_ccw_device, vdev); } +static void drop_airq_indicator(struct virtqueue *vq, struct airq_info *info) +{ + unsigned long i, flags; + + write_lock_irqsave(&info->lock, flags); + for (i = 0; i < airq_iv_end(info->aiv); i++) { + if (vq == (void *)airq_iv_get_ptr(info->aiv, i)) { + airq_iv_free_bit(info->aiv, i); + airq_iv_set_ptr(info->aiv, i, 0); + break; + } + } + write_unlock_irqrestore(&info->lock, flags); +} + +static void virtio_airq_handler(struct airq_struct *airq) +{ + struct airq_info *info = container_of(airq, struct airq_info, airq); + unsigned long ai; + + inc_irq_stat(IRQIO_VAI); + read_lock(&info->lock); + /* Walk through indicators field, summary indicator active. */ + for (ai = 0;;) { + ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv)); + if (ai == -1UL) + break; + vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai)); + } + info->summary_indicator = 0; + smp_wmb(); + /* Walk through indicators field, summary indicator not active. 
*/ + for (ai = 0;;) { + ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv)); + if (ai == -1UL) + break; + vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai)); + } + read_unlock(&info->lock); +} + +static struct airq_info *new_airq_info(void) +{ + struct airq_info *info; + int rc; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return NULL; + rwlock_init(&info->lock); + info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR); + if (!info->aiv) { + kfree(info); + return NULL; + } + info->airq.handler = virtio_airq_handler; + info->airq.lsi_ptr = &info->summary_indicator; + info->airq.lsi_mask = 0xff; + info->airq.isc = VIRTIO_AIRQ_ISC; + rc = register_adapter_interrupt(&info->airq); + if (rc) { + airq_iv_release(info->aiv); + kfree(info); + return NULL; + } + return info; +} + +static void destroy_airq_info(struct airq_info *info) +{ + if (!info) + return; + + unregister_adapter_interrupt(&info->airq); + airq_iv_release(info->aiv); + kfree(info); +} + +static unsigned long get_airq_indicator(struct virtqueue *vqs[], int nvqs, + u64 *first, void **airq_info) +{ + int i, j; + struct airq_info *info; + unsigned long indicator_addr = 0; + unsigned long bit, flags; + + for (i = 0; i < MAX_AIRQ_AREAS && !indicator_addr; i++) { + if (!airq_areas[i]) + airq_areas[i] = new_airq_info(); + info = airq_areas[i]; + if (!info) + return 0; + write_lock_irqsave(&info->lock, flags); + bit = airq_iv_alloc(info->aiv, nvqs); + if (bit == -1UL) { + /* Not enough vacancies. 
*/ + write_unlock_irqrestore(&info->lock, flags); + continue; + } + *first = bit; + *airq_info = info; + indicator_addr = (unsigned long)info->aiv->vector; + for (j = 0; j < nvqs; j++) { + airq_iv_set_ptr(info->aiv, bit + j, + (unsigned long)vqs[j]); + } + write_unlock_irqrestore(&info->lock, flags); + } + return indicator_addr; +} + +static void virtio_ccw_drop_indicators(struct virtio_ccw_device *vcdev) +{ + struct virtio_ccw_vq_info *info; + + list_for_each_entry(info, &vcdev->virtqueues, node) + drop_airq_indicator(info->vq, vcdev->airq_info); +} + static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag) { unsigned long flags; @@ -145,6 +293,51 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev, return ret ? ret : vcdev->err; } +static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, + struct ccw1 *ccw) +{ + int ret; + unsigned long *indicatorp = NULL; + struct virtio_thinint_area *thinint_area = NULL; + struct airq_info *airq_info = vcdev->airq_info; + + if (vcdev->is_thinint) { + thinint_area = kzalloc(sizeof(*thinint_area), + GFP_DMA | GFP_KERNEL); + if (!thinint_area) + return; + thinint_area->summary_indicator = + (unsigned long) &airq_info->summary_indicator; + thinint_area->isc = VIRTIO_AIRQ_ISC; + ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER; + ccw->count = sizeof(*thinint_area); + ccw->cda = (__u32)(unsigned long) thinint_area; + } else { + indicatorp = kmalloc(sizeof(&vcdev->indicators), + GFP_DMA | GFP_KERNEL); + if (!indicatorp) + return; + *indicatorp = 0; + ccw->cmd_code = CCW_CMD_SET_IND; + ccw->count = sizeof(vcdev->indicators); + ccw->cda = (__u32)(unsigned long) indicatorp; + } + /* Deregister indicators from host. */ + vcdev->indicators = 0; + ccw->flags = 0; + ret = ccw_io_helper(vcdev, ccw, + vcdev->is_thinint ? 
+ VIRTIO_CCW_DOING_SET_IND_ADAPTER : + VIRTIO_CCW_DOING_SET_IND); + if (ret && (ret != -ENODEV)) + dev_info(&vcdev->cdev->dev, + "Failed to deregister indicators (%d)\n", ret); + else if (vcdev->is_thinint) + virtio_ccw_drop_indicators(vcdev); + kfree(indicatorp); + kfree(thinint_area); +} + static inline long do_kvm_notify(struct subchannel_id schid, unsigned long queue_index, long cookie) @@ -232,11 +425,13 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev) { struct virtqueue *vq, *n; struct ccw1 *ccw; + struct virtio_ccw_device *vcdev = to_vc_device(vdev); ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); if (!ccw) return; + virtio_ccw_drop_indicator(vcdev, ccw); list_for_each_entry_safe(vq, n, &vdev->vqs, list) virtio_ccw_del_vq(vq, ccw); @@ -326,6 +521,54 @@ out_err: return ERR_PTR(err); } +static int virtio_ccw_register_adapter_ind(struct virtio_ccw_device *vcdev, + struct virtqueue *vqs[], int nvqs, + struct ccw1 *ccw) +{ + int ret; + struct virtio_thinint_area *thinint_area = NULL; + struct airq_info *info; + + thinint_area = kzalloc(sizeof(*thinint_area), GFP_DMA | GFP_KERNEL); + if (!thinint_area) { + ret = -ENOMEM; + goto out; + } + /* Try to get an indicator. */ + thinint_area->indicator = get_airq_indicator(vqs, nvqs, + &thinint_area->bit_nr, + &vcdev->airq_info); + if (!thinint_area->indicator) { + ret = -ENOSPC; + goto out; + } + info = vcdev->airq_info; + thinint_area->summary_indicator = + (unsigned long) &info->summary_indicator; + thinint_area->isc = VIRTIO_AIRQ_ISC; + ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER; + ccw->flags = CCW_FLAG_SLI; + ccw->count = sizeof(*thinint_area); + ccw->cda = (__u32)(unsigned long)thinint_area; + ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND_ADAPTER); + if (ret) { + if (ret == -EOPNOTSUPP) { + /* + * The host does not support adapter interrupts + * for virtio-ccw, stop trying. 
+ */ + virtio_ccw_use_airq = 0; + pr_info("Adapter interrupts unsupported on host\n"); + } else + dev_warn(&vcdev->cdev->dev, + "enabling adapter interrupts = %d\n", ret); + virtio_ccw_drop_indicators(vcdev); + } +out: + kfree(thinint_area); + return ret; +} + static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], @@ -355,15 +598,23 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, if (!indicatorp) goto out; *indicatorp = (unsigned long) &vcdev->indicators; - /* Register queue indicators with host. */ - vcdev->indicators = 0; - ccw->cmd_code = CCW_CMD_SET_IND; - ccw->flags = 0; - ccw->count = sizeof(vcdev->indicators); - ccw->cda = (__u32)(unsigned long) indicatorp; - ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND); - if (ret) - goto out; + if (vcdev->is_thinint) { + ret = virtio_ccw_register_adapter_ind(vcdev, vqs, nvqs, ccw); + if (ret) + /* no error, just fall back to legacy interrupts */ + vcdev->is_thinint = 0; + } + if (!vcdev->is_thinint) { + /* Register queue indicators with host. */ + vcdev->indicators = 0; + ccw->cmd_code = CCW_CMD_SET_IND; + ccw->flags = 0; + ccw->count = sizeof(vcdev->indicators); + ccw->cda = (__u32)(unsigned long) indicatorp; + ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND); + if (ret) + goto out; + } /* Register indicators2 with host for config changes */ *indicatorp = (unsigned long) &vcdev->indicators2; vcdev->indicators2 = 0; @@ -636,6 +887,8 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev, struct virtqueue *vq; struct virtio_driver *drv; + if (!vcdev) + return; /* Check if it's a notification from the host. 
*/ if ((intparm == 0) && (scsw_stctl(&irb->scsw) == @@ -663,6 +916,7 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev, case VIRTIO_CCW_DOING_SET_CONF_IND: case VIRTIO_CCW_DOING_RESET: case VIRTIO_CCW_DOING_READ_VQ_CONF: + case VIRTIO_CCW_DOING_SET_IND_ADAPTER: vcdev->curr_io &= ~activity; wake_up(&vcdev->wait_q); break; @@ -734,23 +988,37 @@ static int virtio_ccw_probe(struct ccw_device *cdev) return 0; } +static struct virtio_ccw_device *virtio_grab_drvdata(struct ccw_device *cdev) +{ + unsigned long flags; + struct virtio_ccw_device *vcdev; + + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); + vcdev = dev_get_drvdata(&cdev->dev); + if (!vcdev) { + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); + return NULL; + } + dev_set_drvdata(&cdev->dev, NULL); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); + return vcdev; +} + static void virtio_ccw_remove(struct ccw_device *cdev) { - struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev); + struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev); - if (cdev->online) { + if (vcdev && cdev->online) unregister_virtio_device(&vcdev->vdev); - dev_set_drvdata(&cdev->dev, NULL); - } cdev->handler = NULL; } static int virtio_ccw_offline(struct ccw_device *cdev) { - struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev); + struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev); - unregister_virtio_device(&vcdev->vdev); - dev_set_drvdata(&cdev->dev, NULL); + if (vcdev) + unregister_virtio_device(&vcdev->vdev); return 0; } @@ -759,6 +1027,7 @@ static int virtio_ccw_online(struct ccw_device *cdev) { int ret; struct virtio_ccw_device *vcdev; + unsigned long flags; vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL); if (!vcdev) { @@ -778,6 +1047,8 @@ static int virtio_ccw_online(struct ccw_device *cdev) goto out_free; } + vcdev->is_thinint = virtio_ccw_use_airq; /* at least try */ + vcdev->vdev.dev.parent = &cdev->dev; vcdev->vdev.dev.release = virtio_ccw_release_dev; vcdev->vdev.config = 
&virtio_ccw_config_ops; @@ -786,7 +1057,9 @@ static int virtio_ccw_online(struct ccw_device *cdev) INIT_LIST_HEAD(&vcdev->virtqueues); spin_lock_init(&vcdev->lock); + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); dev_set_drvdata(&cdev->dev, vcdev); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); vcdev->vdev.id.vendor = cdev->id.cu_type; vcdev->vdev.id.device = cdev->id.cu_model; ret = register_virtio_device(&vcdev->vdev); @@ -797,7 +1070,9 @@ static int virtio_ccw_online(struct ccw_device *cdev) } return 0; out_put: + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); dev_set_drvdata(&cdev->dev, NULL); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); put_device(&vcdev->vdev.dev); return ret; out_free: @@ -935,6 +1210,10 @@ module_init(virtio_ccw_init); static void __exit virtio_ccw_exit(void) { + int i; + ccw_driver_unregister(&virtio_ccw_driver); + for (i = 0; i < MAX_AIRQ_AREAS; i++) + destroy_airq_info(airq_areas[i]); } module_exit(virtio_ccw_exit); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b8e9a43e501a..9816b68b085f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -192,7 +192,7 @@ struct kvm_async_pf { void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif @@ -401,7 +401,9 @@ struct kvm { unsigned long mmu_notifier_seq; long mmu_notifier_count; #endif - long tlbs_dirty; + /* Protected by mmu_lock */ + bool tlbs_dirty; + struct list_head devices; }; @@ -1064,6 +1066,7 @@ extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; extern struct kvm_device_ops kvm_vfio_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; +extern struct kvm_device_ops kvm_flic_ops; 
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 932d7f2637d6..7d76401d2bb5 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -413,6 +413,8 @@ struct kvm_s390_psw { #define KVM_S390_PROGRAM_INT 0xfffe0001u #define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u #define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u +#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u #define KVM_S390_MCHK 0xfffe1000u #define KVM_S390_INT_VIRTIO 0xffff2603u #define KVM_S390_INT_SERVICE 0xffff2401u @@ -434,6 +436,69 @@ struct kvm_s390_interrupt { __u64 parm64; }; +struct kvm_s390_io_info { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; +}; + +struct kvm_s390_ext_info { + __u32 ext_params; + __u32 pad; + __u64 ext_params2; +}; + +struct kvm_s390_pgm_info { + __u64 trans_exc_code; + __u64 mon_code; + __u64 per_address; + __u32 data_exc_code; + __u16 code; + __u16 mon_class_nr; + __u8 per_code; + __u8 per_atmid; + __u8 exc_access_id; + __u8 per_access_id; + __u8 op_access_id; + __u8 pad[3]; +}; + +struct kvm_s390_prefix_info { + __u32 address; +}; + +struct kvm_s390_extcall_info { + __u16 code; +}; + +struct kvm_s390_emerg_info { + __u16 code; +}; + +struct kvm_s390_mchk_info { + __u64 cr14; + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 pad; + __u8 fixed_logout[16]; +}; + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + /* for KVM_SET_GUEST_DEBUG */ #define KVM_GUESTDBG_ENABLE 0x00000001 @@ -855,6 +920,7 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 #define KVM_DEV_TYPE_ARM_VGIC_V2 5 +#define 
KVM_DEV_TYPE_FLIC 6 /* * ioctls for VM fds diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index fbe1a48bd629..13f2d19793e3 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -22,6 +22,10 @@ config KVM_MMIO config KVM_ASYNC_PF bool +# Toggle to switch between direct notification and batch job +config KVM_ASYNC_PF_SYNC + bool + config HAVE_KVM_MSI bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 8631d9c14320..10df100c4514 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -28,6 +28,21 @@ #include "async_pf.h" #include <trace/events/kvm.h> +static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ +#ifdef CONFIG_KVM_ASYNC_PF_SYNC + kvm_arch_async_page_present(vcpu, work); +#endif +} +static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ +#ifndef CONFIG_KVM_ASYNC_PF_SYNC + kvm_arch_async_page_present(vcpu, work); +#endif +} + static struct kmem_cache *async_pf_cache; int kvm_async_pf_init(void) @@ -69,6 +84,7 @@ static void async_pf_execute(struct work_struct *work) down_read(&mm->mmap_sem); get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL); up_read(&mm->mmap_sem); + kvm_async_page_present_sync(vcpu, apf); unuse_mm(mm); spin_lock(&vcpu->async_pf.lock); @@ -97,11 +113,16 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) list_entry(vcpu->async_pf.queue.next, typeof(*work), queue); list_del(&work->queue); + +#ifdef CONFIG_KVM_ASYNC_PF_SYNC + flush_work(&work->work); +#else if (cancel_work_sync(&work->work)) { mmdrop(work->mm); kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ kmem_cache_free(async_pf_cache, work); } +#endif } spin_lock(&vcpu->async_pf.lock); @@ -130,7 +151,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->async_pf.lock); kvm_arch_async_page_ready(vcpu, work); - kvm_arch_async_page_present(vcpu, work); + kvm_async_page_present_async(vcpu, work); list_del(&work->queue); 
vcpu->async_pf.queued--; @@ -138,7 +159,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) } } -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; @@ -159,7 +180,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, work->wakeup_all = false; work->vcpu = vcpu; work->gva = gva; - work->addr = gfn_to_hva(vcpu->kvm, gfn); + work->addr = hva; work->arch = *arch; work->mm = current->mm; atomic_inc(&work->mm->mm_count); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 03a0381b1cb7..5fd4cf8e8888 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,12 +186,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { - long dirty_count = kvm->tlbs_dirty; - - smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; - cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); + kvm->tlbs_dirty = false; } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); @@ -1804,7 +1801,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; if (vcpu == me) continue; - if (waitqueue_active(&vcpu->wq)) + if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; @@ -2284,6 +2281,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm, ops = &kvm_arm_vgic_v2_ops; break; #endif +#ifdef CONFIG_S390 + case KVM_DEV_TYPE_FLIC: + ops = &kvm_flic_ops; + break; +#endif default: return -ENODEV; } |