summaryrefslogtreecommitdiff
path: root/virt/kvm/kvm_main.c
diff options
context:
space:
mode:
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r--virt/kvm/kvm_main.c173
1 files changed, 154 insertions, 19 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 60de4c337f0a..4e4bb5dd2dcd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -49,6 +49,7 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
+#include <linux/kthread.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -87,7 +88,7 @@ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);
@@ -130,10 +131,30 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
{
}
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+ /*
+ * The metadata used by is_zone_device_page() to determine whether or
+ * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+ * the device has been pinned, e.g. by get_user_pages(). WARN if the
+ * page_count() is zero to help detect bad usage of this helper.
+ */
+ if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+ return false;
+
+ return is_zone_device_page(pfn_to_page(pfn));
+}
+
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
+ /*
+ * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+ * perspective they are "normal" pages, albeit with slightly different
+ * usage rules.
+ */
if (pfn_valid(pfn))
- return PageReserved(pfn_to_page(pfn));
+ return PageReserved(pfn_to_page(pfn)) &&
+ !kvm_is_zone_device_pfn(pfn);
return true;
}
@@ -612,6 +633,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
return 0;
}
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
static struct kvm *kvm_create_vm(unsigned long type)
{
int r, i;
@@ -659,22 +697,31 @@ static struct kvm *kvm_create_vm(unsigned long type)
kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
GFP_KERNEL);
if (!kvm->buses[i])
- goto out_err;
+ goto out_err_no_mmu_notifier;
}
r = kvm_init_mmu_notifier(kvm);
if (r)
+ goto out_err_no_mmu_notifier;
+
+ r = kvm_arch_post_init_vm(kvm);
+ if (r)
goto out_err;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
preempt_notifier_inc();
return kvm;
out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ if (kvm->mmu_notifier.ops)
+ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
@@ -724,9 +771,11 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_destroy_vm_debugfs(kvm);
kvm_arch_sync_events(kvm);
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_del(&kvm->vm_list);
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
+ kvm_arch_pre_destroy_vm(kvm);
+
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++) {
if (kvm->buses[i])
@@ -1729,7 +1778,7 @@ static void kvm_release_pfn_dirty(kvm_pfn_t pfn)
void kvm_set_pfn_dirty(kvm_pfn_t pfn)
{
- if (!kvm_is_reserved_pfn(pfn)) {
+ if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
struct page *page = pfn_to_page(pfn);
if (!PageReserved(page))
@@ -1740,7 +1789,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
void kvm_set_pfn_accessed(kvm_pfn_t pfn)
{
- if (!kvm_is_reserved_pfn(pfn))
+ if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
@@ -1996,12 +2045,12 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
if (slots->generation != ghc->generation)
kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len);
- if (unlikely(!ghc->memslot))
- return kvm_write_guest(kvm, ghc->gpa, data, len);
-
if (kvm_is_error_hva(ghc->hva))
return -EFAULT;
+ if (unlikely(!ghc->memslot))
+ return kvm_write_guest(kvm, ghc->gpa, data, len);
+
r = __copy_to_user((void __user *)ghc->hva, data, len);
if (r)
return -EFAULT;
@@ -2022,12 +2071,12 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
if (slots->generation != ghc->generation)
kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len);
- if (unlikely(!ghc->memslot))
- return kvm_read_guest(kvm, ghc->gpa, data, len);
-
if (kvm_is_error_hva(ghc->hva))
return -EFAULT;
+ if (unlikely(!ghc->memslot))
+ return kvm_read_guest(kvm, ghc->gpa, data, len);
+
r = __copy_from_user(data, (void __user *)ghc->hva, len);
if (r)
return -EFAULT;
@@ -2793,6 +2842,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
{
struct kvm_device *dev = filp->private_data;
+ if (dev->kvm->mm != current->mm)
+ return -EIO;
+
switch (ioctl) {
case KVM_SET_DEVICE_ATTR:
return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
@@ -3749,13 +3801,13 @@ static int vm_stat_get(void *_offset, u64 *val)
u64 tmp_val;
*val = 0;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
stat_tmp.kvm = kvm;
vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
*val += tmp_val;
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
return 0;
}
@@ -3769,13 +3821,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
u64 tmp_val;
*val = 0;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
stat_tmp.kvm = kvm;
vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
*val += tmp_val;
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
return 0;
}
@@ -3984,3 +4036,86 @@ void kvm_exit(void)
kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+ struct kvm *kvm;
+ struct task_struct *parent;
+ struct completion init_done;
+ kvm_vm_thread_fn_t thread_fn;
+ uintptr_t data;
+ int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+ /*
+ * The init_context is allocated on the stack of the parent thread, so
+ * we have to locally copy anything that is needed beyond initialization
+ */
+ struct kvm_vm_worker_thread_context *init_context = context;
+ struct kvm *kvm = init_context->kvm;
+ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+ uintptr_t data = init_context->data;
+ int err;
+
+ err = kthread_park(current);
+ /* kthread_park(current) is never supposed to return an error */
+ WARN_ON(err != 0);
+ if (err)
+ goto init_complete;
+
+ err = cgroup_attach_task_all(init_context->parent, current);
+ if (err) {
+ kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+ __func__, err);
+ goto init_complete;
+ }
+
+ set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+ init_context->err = err;
+ complete(&init_context->init_done);
+ init_context = NULL;
+
+ if (err)
+ return err;
+
+ /* Wait to be woken up by the spawner before proceeding. */
+ kthread_parkme();
+
+ if (!kthread_should_stop())
+ err = thread_fn(kvm, data);
+
+ return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+ uintptr_t data, const char *name,
+ struct task_struct **thread_ptr)
+{
+ struct kvm_vm_worker_thread_context init_context = {};
+ struct task_struct *thread;
+
+ *thread_ptr = NULL;
+ init_context.kvm = kvm;
+ init_context.parent = current;
+ init_context.thread_fn = thread_fn;
+ init_context.data = data;
+ init_completion(&init_context.init_done);
+
+ thread = kthread_run(kvm_vm_worker_thread, &init_context,
+ "%s-%d", name, task_pid_nr(current));
+ if (IS_ERR(thread))
+ return PTR_ERR(thread);
+
+ /* kthread_run is never supposed to return NULL */
+ WARN_ON(thread == NULL);
+
+ wait_for_completion(&init_context.init_done);
+
+ if (!init_context.err)
+ *thread_ptr = thread;
+
+ return init_context.err;
+}