1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * VFIO core
4 *
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
7 *
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 */
12
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
18 #include <linux/fs.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
35 #include "vfio.h"
36
37 #define DRIVER_VERSION "0.3"
38 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
39 #define DRIVER_DESC "VFIO - User Level meta-driver"
40
41 static struct vfio {
42 struct class *class;
43 struct list_head iommu_drivers_list;
44 struct mutex iommu_drivers_lock;
45 struct list_head group_list;
46 struct mutex group_lock; /* locks group_list */
47 struct ida group_ida;
48 dev_t group_devt;
49 } vfio;
50
51 struct vfio_iommu_driver {
52 const struct vfio_iommu_driver_ops *ops;
53 struct list_head vfio_next;
54 };
55
56 struct vfio_container {
57 struct kref kref;
58 struct list_head group_list;
59 struct rw_semaphore group_lock;
60 struct vfio_iommu_driver *iommu_driver;
61 void *iommu_data;
62 bool noiommu;
63 };
64
65 struct vfio_group {
66 struct device dev;
67 struct cdev cdev;
68 refcount_t users;
69 unsigned int container_users;
70 struct iommu_group *iommu_group;
71 struct vfio_container *container;
72 struct list_head device_list;
73 struct mutex device_lock;
74 struct list_head vfio_next;
75 struct list_head container_next;
76 enum vfio_group_type type;
77 unsigned int dev_counter;
78 struct rw_semaphore group_rwsem;
79 struct kvm *kvm;
80 struct file *opened_file;
81 struct blocking_notifier_head notifier;
82 };
83
84 #ifdef CONFIG_VFIO_NOIOMMU
85 static bool noiommu __read_mostly;
86 module_param_named(enable_unsafe_noiommu_mode,
87 noiommu, bool, S_IRUGO | S_IWUSR);
88 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
89 #endif
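/*
 * Usage sketch (illustrative, not part of the original source): with
 * CONFIG_VFIO_NOIOMMU built in, the unsafe mode is switched on through this
 * module parameter, e.g. vfio.enable_unsafe_noiommu_mode=1 on the kernel
 * command line or by writing 1 to
 * /sys/module/vfio/parameters/enable_unsafe_noiommu_mode. Affected groups
 * then appear as /dev/vfio/noiommu-$GROUP and opening them requires
 * CAP_SYS_RAWIO.
 */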
90
91 static DEFINE_XARRAY(vfio_device_set_xa);
92 static const struct file_operations vfio_group_fops;
93
94 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
95 {
96 unsigned long idx = (unsigned long)set_id;
97 struct vfio_device_set *new_dev_set;
98 struct vfio_device_set *dev_set;
99
100 if (WARN_ON(!set_id))
101 return -EINVAL;
102
103 /*
104 * Atomically acquire a singleton object in the xarray for this set_id
105 */
106 xa_lock(&vfio_device_set_xa);
107 dev_set = xa_load(&vfio_device_set_xa, idx);
108 if (dev_set)
109 goto found_get_ref;
110 xa_unlock(&vfio_device_set_xa);
111
112 new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
113 if (!new_dev_set)
114 return -ENOMEM;
115 mutex_init(&new_dev_set->lock);
116 INIT_LIST_HEAD(&new_dev_set->device_list);
117 new_dev_set->set_id = set_id;
118
119 xa_lock(&vfio_device_set_xa);
120 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
121 GFP_KERNEL);
122 if (!dev_set) {
123 dev_set = new_dev_set;
124 goto found_get_ref;
125 }
126
127 kfree(new_dev_set);
128 if (xa_is_err(dev_set)) {
129 xa_unlock(&vfio_device_set_xa);
130 return xa_err(dev_set);
131 }
132
133 found_get_ref:
134 dev_set->device_count++;
135 xa_unlock(&vfio_device_set_xa);
136 mutex_lock(&dev_set->lock);
137 device->dev_set = dev_set;
138 list_add_tail(&device->dev_set_list, &dev_set->device_list);
139 mutex_unlock(&dev_set->lock);
140 return 0;
141 }
142 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
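/*
 * Usage sketch (illustrative, not taken verbatim from a driver): a driver
 * whose devices must be reset together passes the same set_id for each of
 * them before registration, e.g. a PCI driver might key the set off the
 * slot or bus so a hot reset covers every member:
 *
 *	if (pdev->slot)
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *	else
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
 *
 * pdev/vdev are placeholder names. Drivers that skip this get a singleton
 * set created for them in __vfio_register_dev().
 */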
143
144 static void vfio_release_device_set(struct vfio_device *device)
145 {
146 struct vfio_device_set *dev_set = device->dev_set;
147
148 if (!dev_set)
149 return;
150
151 mutex_lock(&dev_set->lock);
152 list_del(&device->dev_set_list);
153 mutex_unlock(&dev_set->lock);
154
155 xa_lock(&vfio_device_set_xa);
156 if (!--dev_set->device_count) {
157 __xa_erase(&vfio_device_set_xa,
158 (unsigned long)dev_set->set_id);
159 mutex_destroy(&dev_set->lock);
160 kfree(dev_set);
161 }
162 xa_unlock(&vfio_device_set_xa);
163 }
164
165 #ifdef CONFIG_VFIO_NOIOMMU
166 static void *vfio_noiommu_open(unsigned long arg)
167 {
168 if (arg != VFIO_NOIOMMU_IOMMU)
169 return ERR_PTR(-EINVAL);
170 if (!capable(CAP_SYS_RAWIO))
171 return ERR_PTR(-EPERM);
172
173 return NULL;
174 }
175
176 static void vfio_noiommu_release(void *iommu_data)
177 {
178 }
179
180 static long vfio_noiommu_ioctl(void *iommu_data,
181 unsigned int cmd, unsigned long arg)
182 {
183 if (cmd == VFIO_CHECK_EXTENSION)
184 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
185
186 return -ENOTTY;
187 }
188
189 static int vfio_noiommu_attach_group(void *iommu_data,
190 struct iommu_group *iommu_group, enum vfio_group_type type)
191 {
192 return 0;
193 }
194
195 static void vfio_noiommu_detach_group(void *iommu_data,
196 struct iommu_group *iommu_group)
197 {
198 }
199
200 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
201 .name = "vfio-noiommu",
202 .owner = THIS_MODULE,
203 .open = vfio_noiommu_open,
204 .release = vfio_noiommu_release,
205 .ioctl = vfio_noiommu_ioctl,
206 .attach_group = vfio_noiommu_attach_group,
207 .detach_group = vfio_noiommu_detach_group,
208 };
209
210 /*
211 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
212 * use vfio-noiommu.
213 */
214 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
215 const struct vfio_iommu_driver *driver)
216 {
217 return container->noiommu == (driver->ops == &vfio_noiommu_ops);
218 }
219 #else
220 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
221 const struct vfio_iommu_driver *driver)
222 {
223 return true;
224 }
225 #endif /* CONFIG_VFIO_NOIOMMU */
226
227 /*
228 * IOMMU driver registration
229 */
230 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
231 {
232 struct vfio_iommu_driver *driver, *tmp;
233
234 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
235 if (!driver)
236 return -ENOMEM;
237
238 driver->ops = ops;
239
240 mutex_lock(&vfio.iommu_drivers_lock);
241
242 /* Check for duplicates */
243 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
244 if (tmp->ops == ops) {
245 mutex_unlock(&vfio.iommu_drivers_lock);
246 kfree(driver);
247 return -EINVAL;
248 }
249 }
250
251 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
252
253 mutex_unlock(&vfio.iommu_drivers_lock);
254
255 return 0;
256 }
257 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
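/*
 * Usage sketch (illustrative): an IOMMU backend such as vfio_iommu_type1
 * registers a static ops table from its module_init() and removes it on
 * exit. The names below are placeholders, not an existing driver:
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	ret = vfio_register_iommu_driver(&my_iommu_ops);
 *	...
 *	vfio_unregister_iommu_driver(&my_iommu_ops);
 */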
258
259 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
260 {
261 struct vfio_iommu_driver *driver;
262
263 mutex_lock(&vfio.iommu_drivers_lock);
264 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
265 if (driver->ops == ops) {
266 list_del(&driver->vfio_next);
267 mutex_unlock(&vfio.iommu_drivers_lock);
268 kfree(driver);
269 return;
270 }
271 }
272 mutex_unlock(&vfio.iommu_drivers_lock);
273 }
274 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
275
276 static void vfio_group_get(struct vfio_group *group);
277
278 /*
279 * Container objects - containers are created when /dev/vfio/vfio is
280 * opened, but their lifecycle extends until the last user is done, so
281 * it's freed via kref. Must support container/group/device being
282 * closed in any order.
283 */
284 static void vfio_container_get(struct vfio_container *container)
285 {
286 kref_get(&container->kref);
287 }
288
289 static void vfio_container_release(struct kref *kref)
290 {
291 struct vfio_container *container;
292 container = container_of(kref, struct vfio_container, kref);
293
294 kfree(container);
295 }
296
297 static void vfio_container_put(struct vfio_container *container)
298 {
299 kref_put(&container->kref, vfio_container_release);
300 }
301
302 /*
303 * Group objects - create, release, get, put, search
304 */
305 static struct vfio_group *
306 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
307 {
308 struct vfio_group *group;
309
310 list_for_each_entry(group, &vfio.group_list, vfio_next) {
311 if (group->iommu_group == iommu_group) {
312 vfio_group_get(group);
313 return group;
314 }
315 }
316 return NULL;
317 }
318
319 static struct vfio_group *
320 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
321 {
322 struct vfio_group *group;
323
324 mutex_lock(&vfio.group_lock);
325 group = __vfio_group_get_from_iommu(iommu_group);
326 mutex_unlock(&vfio.group_lock);
327 return group;
328 }
329
330 static void vfio_group_release(struct device *dev)
331 {
332 struct vfio_group *group = container_of(dev, struct vfio_group, dev);
333
334 mutex_destroy(&group->device_lock);
335 iommu_group_put(group->iommu_group);
336 ida_free(&vfio.group_ida, MINOR(group->dev.devt));
337 kfree(group);
338 }
339
340 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
341 enum vfio_group_type type)
342 {
343 struct vfio_group *group;
344 int minor;
345
346 group = kzalloc(sizeof(*group), GFP_KERNEL);
347 if (!group)
348 return ERR_PTR(-ENOMEM);
349
350 minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
351 if (minor < 0) {
352 kfree(group);
353 return ERR_PTR(minor);
354 }
355
356 device_initialize(&group->dev);
357 group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
358 group->dev.class = vfio.class;
359 group->dev.release = vfio_group_release;
360 cdev_init(&group->cdev, &vfio_group_fops);
361 group->cdev.owner = THIS_MODULE;
362
363 refcount_set(&group->users, 1);
364 init_rwsem(&group->group_rwsem);
365 INIT_LIST_HEAD(&group->device_list);
366 mutex_init(&group->device_lock);
367 group->iommu_group = iommu_group;
368 /* put in vfio_group_release() */
369 iommu_group_ref_get(iommu_group);
370 group->type = type;
371 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
372
373 return group;
374 }
375
376 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
377 enum vfio_group_type type)
378 {
379 struct vfio_group *group;
380 struct vfio_group *ret;
381 int err;
382
383 group = vfio_group_alloc(iommu_group, type);
384 if (IS_ERR(group))
385 return group;
386
387 err = dev_set_name(&group->dev, "%s%d",
388 group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
389 iommu_group_id(iommu_group));
390 if (err) {
391 ret = ERR_PTR(err);
392 goto err_put;
393 }
394
395 mutex_lock(&vfio.group_lock);
396
397 /* Did we race creating this group? */
398 ret = __vfio_group_get_from_iommu(iommu_group);
399 if (ret)
400 goto err_unlock;
401
402 err = cdev_device_add(&group->cdev, &group->dev);
403 if (err) {
404 ret = ERR_PTR(err);
405 goto err_unlock;
406 }
407
408 list_add(&group->vfio_next, &vfio.group_list);
409
410 mutex_unlock(&vfio.group_lock);
411 return group;
412
413 err_unlock:
414 mutex_unlock(&vfio.group_lock);
415 err_put:
416 put_device(&group->dev);
417 return ret;
418 }
419
420 static void vfio_group_put(struct vfio_group *group)
421 {
422 if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
423 return;
424
425 /*
426 * These data structures all have paired operations that can only be
427 * undone when the caller holds a live reference on the group. Since all
428 * pairs must be undone, these WARN_ON's indicate some caller did not
429 * properly hold the group reference.
430 */
431 WARN_ON(!list_empty(&group->device_list));
432 WARN_ON(group->container || group->container_users);
433 WARN_ON(group->notifier.head);
434
435 list_del(&group->vfio_next);
436 cdev_device_del(&group->cdev, &group->dev);
437 mutex_unlock(&vfio.group_lock);
438
439 put_device(&group->dev);
440 }
441
442 static void vfio_group_get(struct vfio_group *group)
443 {
444 refcount_inc(&group->users);
445 }
446
447 /*
448 * Device objects - create, release, get, put, search
449 */
450 /* Device reference always implies a group reference */
451 static void vfio_device_put(struct vfio_device *device)
452 {
453 if (refcount_dec_and_test(&device->refcount))
454 complete(&device->comp);
455 }
456
457 static bool vfio_device_try_get(struct vfio_device *device)
458 {
459 return refcount_inc_not_zero(&device->refcount);
460 }
461
462 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
463 struct device *dev)
464 {
465 struct vfio_device *device;
466
467 mutex_lock(&group->device_lock);
468 list_for_each_entry(device, &group->device_list, group_next) {
469 if (device->dev == dev && vfio_device_try_get(device)) {
470 mutex_unlock(&group->device_lock);
471 return device;
472 }
473 }
474 mutex_unlock(&group->device_lock);
475 return NULL;
476 }
477
478 /*
479 * VFIO driver API
480 */
481 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
482 const struct vfio_device_ops *ops)
483 {
484 init_completion(&device->comp);
485 device->dev = dev;
486 device->ops = ops;
487 }
488 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
489
490 void vfio_uninit_group_dev(struct vfio_device *device)
491 {
492 vfio_release_device_set(device);
493 }
494 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
495
496 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
497 enum vfio_group_type type)
498 {
499 struct iommu_group *iommu_group;
500 struct vfio_group *group;
501 int ret;
502
503 iommu_group = iommu_group_alloc();
504 if (IS_ERR(iommu_group))
505 return ERR_CAST(iommu_group);
506
507 iommu_group_set_name(iommu_group, "vfio-noiommu");
508 ret = iommu_group_add_device(iommu_group, dev);
509 if (ret)
510 goto out_put_group;
511
512 group = vfio_create_group(iommu_group, type);
513 if (IS_ERR(group)) {
514 ret = PTR_ERR(group);
515 goto out_remove_device;
516 }
517 iommu_group_put(iommu_group);
518 return group;
519
520 out_remove_device:
521 iommu_group_remove_device(dev);
522 out_put_group:
523 iommu_group_put(iommu_group);
524 return ERR_PTR(ret);
525 }
526
527 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
528 {
529 struct iommu_group *iommu_group;
530 struct vfio_group *group;
531
532 iommu_group = iommu_group_get(dev);
533 #ifdef CONFIG_VFIO_NOIOMMU
534 if (!iommu_group && noiommu) {
535 /*
536 * With noiommu enabled, create an IOMMU group for devices that
537 * don't already have one, implying no IOMMU hardware/driver
538 * exists. Taint the kernel because we're about to give a DMA
539 * capable device to a user without IOMMU protection.
540 */
541 group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
542 if (!IS_ERR(group)) {
543 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
544 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
545 }
546 return group;
547 }
548 #endif
549 if (!iommu_group)
550 return ERR_PTR(-EINVAL);
551
552 /*
553 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
554 * restore cache coherency. It has to be checked here because it is only
555 * valid for cases where we are using iommu groups.
556 */
557 if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
558 iommu_group_put(iommu_group);
559 return ERR_PTR(-EINVAL);
560 }
561
562 group = vfio_group_get_from_iommu(iommu_group);
563 if (!group)
564 group = vfio_create_group(iommu_group, VFIO_IOMMU);
565
566 /* The vfio_group holds a reference to the iommu_group */
567 iommu_group_put(iommu_group);
568 return group;
569 }
570
571 static int __vfio_register_dev(struct vfio_device *device,
572 struct vfio_group *group)
573 {
574 struct vfio_device *existing_device;
575
576 if (IS_ERR(group))
577 return PTR_ERR(group);
578
579 /*
580 * If the driver doesn't specify a set then the device is added to a
581 * singleton set just for itself.
582 */
583 if (!device->dev_set)
584 vfio_assign_device_set(device, device);
585
586 existing_device = vfio_group_get_device(group, device->dev);
587 if (existing_device) {
588 dev_WARN(device->dev, "Device already exists on group %d\n",
589 iommu_group_id(group->iommu_group));
590 vfio_device_put(existing_device);
591 if (group->type == VFIO_NO_IOMMU ||
592 group->type == VFIO_EMULATED_IOMMU)
593 iommu_group_remove_device(device->dev);
594 vfio_group_put(group);
595 return -EBUSY;
596 }
597
598 /* Our reference on group is moved to the device */
599 device->group = group;
600
601 /* Refcounting can't start until the driver calls register */
602 refcount_set(&device->refcount, 1);
603
604 mutex_lock(&group->device_lock);
605 list_add(&device->group_next, &group->device_list);
606 group->dev_counter++;
607 mutex_unlock(&group->device_lock);
608
609 return 0;
610 }
611
612 int vfio_register_group_dev(struct vfio_device *device)
613 {
614 return __vfio_register_dev(device,
615 vfio_group_find_or_alloc(device->dev));
616 }
617 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
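/*
 * Usage sketch (illustrative): a bus driver pairs the init/register and
 * unregister/uninit calls in its probe and remove paths. vdev, pdev and
 * my_vfio_ops are placeholders for the driver's own structures:
 *
 *	probe:
 *		vfio_init_group_dev(&vdev->vdev, &pdev->dev, &my_vfio_ops);
 *		... driver specific setup ...
 *		ret = vfio_register_group_dev(&vdev->vdev);
 *
 *	remove:
 *		vfio_unregister_group_dev(&vdev->vdev);
 *		... driver specific teardown ...
 *		vfio_uninit_group_dev(&vdev->vdev);
 */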
618
619 /*
620 * Register a virtual device without IOMMU backing. The user of this
621 * device must not be able to directly trigger unmediated DMA.
622 */
623 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
624 {
625 return __vfio_register_dev(device,
626 vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
627 }
628 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
629
630 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
631 char *buf)
632 {
633 struct vfio_device *it, *device = ERR_PTR(-ENODEV);
634
635 mutex_lock(&group->device_lock);
636 list_for_each_entry(it, &group->device_list, group_next) {
637 int ret;
638
639 if (it->ops->match) {
640 ret = it->ops->match(it, buf);
641 if (ret < 0) {
642 device = ERR_PTR(ret);
643 break;
644 }
645 } else {
646 ret = !strcmp(dev_name(it->dev), buf);
647 }
648
649 if (ret && vfio_device_try_get(it)) {
650 device = it;
651 break;
652 }
653 }
654 mutex_unlock(&group->device_lock);
655
656 return device;
657 }
658
659 /*
660 * Decrement the device reference count and wait for the device to be
661 * removed. Open file descriptors for the device... */
662 void vfio_unregister_group_dev(struct vfio_device *device)
663 {
664 struct vfio_group *group = device->group;
665 unsigned int i = 0;
666 bool interrupted = false;
667 long rc;
668
669 vfio_device_put(device);
670 rc = try_wait_for_completion(&device->comp);
671 while (rc <= 0) {
672 if (device->ops->request)
673 device->ops->request(device, i++);
674
675 if (interrupted) {
676 rc = wait_for_completion_timeout(&device->comp,
677 HZ * 10);
678 } else {
679 rc = wait_for_completion_interruptible_timeout(
680 &device->comp, HZ * 10);
681 if (rc < 0) {
682 interrupted = true;
683 dev_warn(device->dev,
684 "Device is currently in use, task"
685 " \"%s\" (%d) "
686 "blocked until device is released",
687 current->comm, task_pid_nr(current));
688 }
689 }
690 }
691
692 mutex_lock(&group->device_lock);
693 list_del(&device->group_next);
694 group->dev_counter--;
695 mutex_unlock(&group->device_lock);
696
697 if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
698 iommu_group_remove_device(device->dev);
699
700 /* Matches the get in vfio_register_group_dev() */
701 vfio_group_put(group);
702 }
703 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
704
705 /*
706 * VFIO base fd, /dev/vfio/vfio
707 */
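/*
 * Userspace usage sketch (condensed from Documentation/driver-api/vfio.rst,
 * for orientation only; group and device names vary):
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container, VFIO_GET_API_VERSION);	// == VFIO_API_VERSION
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */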
708 static long vfio_ioctl_check_extension(struct vfio_container *container,
709 unsigned long arg)
710 {
711 struct vfio_iommu_driver *driver;
712 long ret = 0;
713
714 down_read(&container->group_lock);
715
716 driver = container->iommu_driver;
717
718 switch (arg) {
719 /* No base extensions yet */
720 default:
721 /*
722 * If no driver is set, poll all registered drivers for
723 * extensions and return the first positive result. If
724 * a driver is already set, further queries will be passed
725 * only to that driver.
726 */
727 if (!driver) {
728 mutex_lock(&vfio.iommu_drivers_lock);
729 list_for_each_entry(driver, &vfio.iommu_drivers_list,
730 vfio_next) {
731
732 if (!list_empty(&container->group_list) &&
733 !vfio_iommu_driver_allowed(container,
734 driver))
735 continue;
736 if (!try_module_get(driver->ops->owner))
737 continue;
738
739 ret = driver->ops->ioctl(NULL,
740 VFIO_CHECK_EXTENSION,
741 arg);
742 module_put(driver->ops->owner);
743 if (ret > 0)
744 break;
745 }
746 mutex_unlock(&vfio.iommu_drivers_lock);
747 } else
748 ret = driver->ops->ioctl(container->iommu_data,
749 VFIO_CHECK_EXTENSION, arg);
750 }
751
752 up_read(&container->group_lock);
753
754 return ret;
755 }
756
757 /* hold write lock on container->group_lock */
758 static int __vfio_container_attach_groups(struct vfio_container *container,
759 struct vfio_iommu_driver *driver,
760 void *data)
761 {
762 struct vfio_group *group;
763 int ret = -ENODEV;
764
765 list_for_each_entry(group, &container->group_list, container_next) {
766 ret = driver->ops->attach_group(data, group->iommu_group,
767 group->type);
768 if (ret)
769 goto unwind;
770 }
771
772 return ret;
773
774 unwind:
775 list_for_each_entry_continue_reverse(group, &container->group_list,
776 container_next) {
777 driver->ops->detach_group(data, group->iommu_group);
778 }
779
780 return ret;
781 }
782
783 static long vfio_ioctl_set_iommu(struct vfio_container *container,
784 unsigned long arg)
785 {
786 struct vfio_iommu_driver *driver;
787 long ret = -ENODEV;
788
789 down_write(&container->group_lock);
790
791 /*
792 * The container is designed to be an unprivileged interface while
793 * the group can be assigned to specific users. Therefore, only by
794 * adding a group to a container does the user get the privilege of
795 * enabling the iommu, which may allocate finite resources. There
796 * is no unset_iommu, but by removing all the groups from a container,
797 * the container is deprivileged and returns to an unset state.
798 */
799 if (list_empty(&container->group_list) || container->iommu_driver) {
800 up_write(&container->group_lock);
801 return -EINVAL;
802 }
803
804 mutex_lock(&vfio.iommu_drivers_lock);
805 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
806 void *data;
807
808 if (!vfio_iommu_driver_allowed(container, driver))
809 continue;
810 if (!try_module_get(driver->ops->owner))
811 continue;
812
813 /*
814 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
815 * so test which iommu driver reported support for this
816 * extension and call open on them. We also pass them the
817 * magic, allowing a single driver to support multiple
818 * interfaces if they'd like.
819 */
820 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
821 module_put(driver->ops->owner);
822 continue;
823 }
824
825 data = driver->ops->open(arg);
826 if (IS_ERR(data)) {
827 ret = PTR_ERR(data);
828 module_put(driver->ops->owner);
829 continue;
830 }
831
832 ret = __vfio_container_attach_groups(container, driver, data);
833 if (ret) {
834 driver->ops->release(data);
835 module_put(driver->ops->owner);
836 continue;
837 }
838
839 container->iommu_driver = driver;
840 container->iommu_data = data;
841 break;
842 }
843
844 mutex_unlock(&vfio.iommu_drivers_lock);
845 up_write(&container->group_lock);
846
847 return ret;
848 }
849
850 static long vfio_fops_unl_ioctl(struct file *filep,
851 unsigned int cmd, unsigned long arg)
852 {
853 struct vfio_container *container = filep->private_data;
854 struct vfio_iommu_driver *driver;
855 void *data;
856 long ret = -EINVAL;
857
858 if (!container)
859 return ret;
860
861 switch (cmd) {
862 case VFIO_GET_API_VERSION:
863 ret = VFIO_API_VERSION;
864 break;
865 case VFIO_CHECK_EXTENSION:
866 ret = vfio_ioctl_check_extension(container, arg);
867 break;
868 case VFIO_SET_IOMMU:
869 ret = vfio_ioctl_set_iommu(container, arg);
870 break;
871 default:
872 driver = container->iommu_driver;
873 data = container->iommu_data;
874
875 if (driver) /* passthrough all unrecognized ioctls */
876 ret = driver->ops->ioctl(data, cmd, arg);
877 }
878
879 return ret;
880 }
881
882 static int vfio_fops_open(struct inode *inode, struct file *filep)
883 {
884 struct vfio_container *container;
885
886 container = kzalloc(sizeof(*container), GFP_KERNEL);
887 if (!container)
888 return -ENOMEM;
889
890 INIT_LIST_HEAD(&container->group_list);
891 init_rwsem(&container->group_lock);
892 kref_init(&container->kref);
893
894 filep->private_data = container;
895
896 return 0;
897 }
898
899 static int vfio_fops_release(struct inode *inode, struct file *filep)
900 {
901 struct vfio_container *container = filep->private_data;
902 struct vfio_iommu_driver *driver = container->iommu_driver;
903
904 if (driver && driver->ops->notify)
905 driver->ops->notify(container->iommu_data,
906 VFIO_IOMMU_CONTAINER_CLOSE);
907
908 filep->private_data = NULL;
909
910 vfio_container_put(container);
911
912 return 0;
913 }
914
915 static const struct file_operations vfio_fops = {
916 .owner = THIS_MODULE,
917 .open = vfio_fops_open,
918 .release = vfio_fops_release,
919 .unlocked_ioctl = vfio_fops_unl_ioctl,
920 .compat_ioctl = compat_ptr_ioctl,
921 };
922
923 /*
924 * VFIO Group fd, /dev/vfio/$GROUP
925 */
926 static void __vfio_group_unset_container(struct vfio_group *group)
927 {
928 struct vfio_container *container = group->container;
929 struct vfio_iommu_driver *driver;
930
931 lockdep_assert_held_write(&group->group_rwsem);
932
933 down_write(&container->group_lock);
934
935 driver = container->iommu_driver;
936 if (driver)
937 driver->ops->detach_group(container->iommu_data,
938 group->iommu_group);
939
940 if (group->type == VFIO_IOMMU)
941 iommu_group_release_dma_owner(group->iommu_group);
942
943 group->container = NULL;
944 group->container_users = 0;
945 list_del(&group->container_next);
946
947 /* Detaching the last group deprivileges a container, remove iommu */
948 if (driver && list_empty(&container->group_list)) {
949 driver->ops->release(container->iommu_data);
950 module_put(driver->ops->owner);
951 container->iommu_driver = NULL;
952 container->iommu_data = NULL;
953 }
954
955 up_write(&container->group_lock);
956
957 vfio_container_put(container);
958 }
959
960 /*
961 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
962 * if there was no container to unset. Since the ioctl is called on
963 * the group, we know that it still exists, therefore the only valid
964 * transition here is 1->0.
965 */
966 static int vfio_group_unset_container(struct vfio_group *group)
967 {
968 lockdep_assert_held_write(&group->group_rwsem);
969
970 if (!group->container)
971 return -EINVAL;
972 if (group->container_users != 1)
973 return -EBUSY;
974 __vfio_group_unset_container(group);
975 return 0;
976 }
977
978 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
979 {
980 struct fd f;
981 struct vfio_container *container;
982 struct vfio_iommu_driver *driver;
983 int ret = 0;
984
985 lockdep_assert_held_write(&group->group_rwsem);
986
987 if (group->container || WARN_ON(group->container_users))
988 return -EINVAL;
989
990 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
991 return -EPERM;
992
993 f = fdget(container_fd);
994 if (!f.file)
995 return -EBADF;
996
997 /* Sanity check, is this really our fd? */
998 if (f.file->f_op != &vfio_fops) {
999 fdput(f);
1000 return -EINVAL;
1001 }
1002
1003 container = f.file->private_data;
1004 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1005
1006 down_write(&container->group_lock);
1007
1008 /* Real groups and fake groups cannot mix */
1009 if (!list_empty(&container->group_list) &&
1010 container->noiommu != (group->type == VFIO_NO_IOMMU)) {
1011 ret = -EPERM;
1012 goto unlock_out;
1013 }
1014
1015 if (group->type == VFIO_IOMMU) {
1016 ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
1017 if (ret)
1018 goto unlock_out;
1019 }
1020
1021 driver = container->iommu_driver;
1022 if (driver) {
1023 ret = driver->ops->attach_group(container->iommu_data,
1024 group->iommu_group,
1025 group->type);
1026 if (ret) {
1027 if (group->type == VFIO_IOMMU)
1028 iommu_group_release_dma_owner(
1029 group->iommu_group);
1030 goto unlock_out;
1031 }
1032 }
1033
1034 group->container = container;
1035 group->container_users = 1;
1036 container->noiommu = (group->type == VFIO_NO_IOMMU);
1037 list_add(&group->container_next, &container->group_list);
1038
1039 /* Get a reference on the container and mark a user within the group */
1040 vfio_container_get(container);
1041
1042 unlock_out:
1043 up_write(&container->group_lock);
1044 fdput(f);
1045 return ret;
1046 }
1047
1048 static const struct file_operations vfio_device_fops;
1049
1050 /* true if the vfio_device has open_device() called but not close_device() */
1051 static bool vfio_assert_device_open(struct vfio_device *device)
1052 {
1053 return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
1054 }
1055
1056 static int vfio_device_assign_container(struct vfio_device *device)
1057 {
1058 struct vfio_group *group = device->group;
1059
1060 lockdep_assert_held_write(&group->group_rwsem);
1061
1062 if (!group->container || !group->container->iommu_driver ||
1063 WARN_ON(!group->container_users))
1064 return -EINVAL;
1065
1066 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1067 return -EPERM;
1068
1069 get_file(group->opened_file);
1070 group->container_users++;
1071 return 0;
1072 }
1073
1074 static void vfio_device_unassign_container(struct vfio_device *device)
1075 {
1076 down_write(&device->group->group_rwsem);
1077 WARN_ON(device->group->container_users <= 1);
1078 device->group->container_users--;
1079 fput(device->group->opened_file);
1080 up_write(&device->group->group_rwsem);
1081 }
1082
1083 static struct file *vfio_device_open(struct vfio_device *device)
1084 {
1085 struct file *filep;
1086 int ret;
1087
1088 down_write(&device->group->group_rwsem);
1089 ret = vfio_device_assign_container(device);
1090 up_write(&device->group->group_rwsem);
1091 if (ret)
1092 return ERR_PTR(ret);
1093
1094 if (!try_module_get(device->dev->driver->owner)) {
1095 ret = -ENODEV;
1096 goto err_unassign_container;
1097 }
1098
1099 mutex_lock(&device->dev_set->lock);
1100 device->open_count++;
1101 if (device->open_count == 1) {
1102 /*
1103 * Here we pass the KVM pointer with the group under the read
1104 * lock. If the device driver will use it, it must obtain a
1105 * reference and release it during close_device.
1106 */
1107 down_read(&device->group->group_rwsem);
1108 device->kvm = device->group->kvm;
1109
1110 if (device->ops->open_device) {
1111 ret = device->ops->open_device(device);
1112 if (ret)
1113 goto err_undo_count;
1114 }
1115 up_read(&device->group->group_rwsem);
1116 }
1117 mutex_unlock(&device->dev_set->lock);
1118
1119 /*
1120 * We can't use anon_inode_getfd() because we need to modify
1121 * the f_mode flags directly to allow more than just ioctls
1122 */
1123 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1124 device, O_RDWR);
1125 if (IS_ERR(filep)) {
1126 ret = PTR_ERR(filep);
1127 goto err_close_device;
1128 }
1129
1130 /*
1131 * TODO: add an anon_inode interface to do this.
1132 * Appears to be missing by lack of need rather than
1133 * explicitly prevented. Now there's need.
1134 */
1135 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1136
1137 if (device->group->type == VFIO_NO_IOMMU)
1138 dev_warn(device->dev, "vfio-noiommu device opened by user "
1139 "(%s:%d)\n", current->comm, task_pid_nr(current));
1140 /*
1141 * On success the ref of device is moved to the file and
1142 * put in vfio_device_fops_release()
1143 */
1144 return filep;
1145
1146 err_close_device:
1147 mutex_lock(&device->dev_set->lock);
1148 down_read(&device->group->group_rwsem);
1149 if (device->open_count == 1 && device->ops->close_device)
1150 device->ops->close_device(device);
1151 err_undo_count:
1152 device->open_count--;
1153 if (device->open_count == 0 && device->kvm)
1154 device->kvm = NULL;
1155 up_read(&device->group->group_rwsem);
1156 mutex_unlock(&device->dev_set->lock);
1157 module_put(device->dev->driver->owner);
1158 err_unassign_container:
1159 vfio_device_unassign_container(device);
1160 return ERR_PTR(ret);
1161 }
1162
1163 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1164 {
1165 struct vfio_device *device;
1166 struct file *filep;
1167 int fdno;
1168 int ret;
1169
1170 device = vfio_device_get_from_name(group, buf);
1171 if (IS_ERR(device))
1172 return PTR_ERR(device);
1173
1174 fdno = get_unused_fd_flags(O_CLOEXEC);
1175 if (fdno < 0) {
1176 ret = fdno;
1177 goto err_put_device;
1178 }
1179
1180 filep = vfio_device_open(device);
1181 if (IS_ERR(filep)) {
1182 ret = PTR_ERR(filep);
1183 goto err_put_fdno;
1184 }
1185
1186 fd_install(fdno, filep);
1187 return fdno;
1188
1189 err_put_fdno:
1190 put_unused_fd(fdno);
1191 err_put_device:
1192 vfio_device_put(device);
1193 return ret;
1194 }
1195
1196 static long vfio_group_fops_unl_ioctl(struct file *filep,
1197 unsigned int cmd, unsigned long arg)
1198 {
1199 struct vfio_group *group = filep->private_data;
1200 long ret = -ENOTTY;
1201
1202 switch (cmd) {
1203 case VFIO_GROUP_GET_STATUS:
1204 {
1205 struct vfio_group_status status;
1206 unsigned long minsz;
1207
1208 minsz = offsetofend(struct vfio_group_status, flags);
1209
1210 if (copy_from_user(&status, (void __user *)arg, minsz))
1211 return -EFAULT;
1212
1213 if (status.argsz < minsz)
1214 return -EINVAL;
1215
1216 status.flags = 0;
1217
1218 down_read(&group->group_rwsem);
1219 if (group->container)
1220 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
1221 VFIO_GROUP_FLAGS_VIABLE;
1222 else if (!iommu_group_dma_owner_claimed(group->iommu_group))
1223 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1224 up_read(&group->group_rwsem);
1225
1226 if (copy_to_user((void __user *)arg, &status, minsz))
1227 return -EFAULT;
1228
1229 ret = 0;
1230 break;
1231 }
1232 case VFIO_GROUP_SET_CONTAINER:
1233 {
1234 int fd;
1235
1236 if (get_user(fd, (int __user *)arg))
1237 return -EFAULT;
1238
1239 if (fd < 0)
1240 return -EINVAL;
1241
1242 down_write(&group->group_rwsem);
1243 ret = vfio_group_set_container(group, fd);
1244 up_write(&group->group_rwsem);
1245 break;
1246 }
1247 case VFIO_GROUP_UNSET_CONTAINER:
1248 down_write(&group->group_rwsem);
1249 ret = vfio_group_unset_container(group);
1250 up_write(&group->group_rwsem);
1251 break;
1252 case VFIO_GROUP_GET_DEVICE_FD:
1253 {
1254 char *buf;
1255
1256 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1257 if (IS_ERR(buf))
1258 return PTR_ERR(buf);
1259
1260 ret = vfio_group_get_device_fd(group, buf);
1261 kfree(buf);
1262 break;
1263 }
1264 }
1265
1266 return ret;
1267 }
1268
1269 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1270 {
1271 struct vfio_group *group =
1272 container_of(inode->i_cdev, struct vfio_group, cdev);
1273 int ret;
1274
1275 down_write(&group->group_rwsem);
1276
1277 /* users can be zero if this races with vfio_group_put() */
1278 if (!refcount_inc_not_zero(&group->users)) {
1279 ret = -ENODEV;
1280 goto err_unlock;
1281 }
1282
1283 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1284 ret = -EPERM;
1285 goto err_put;
1286 }
1287
1288 /*
1289 * Do we need multiple instances of the group open? Seems not.
1290 */
1291 if (group->opened_file) {
1292 ret = -EBUSY;
1293 goto err_put;
1294 }
1295 group->opened_file = filep;
1296 filep->private_data = group;
1297
1298 up_write(&group->group_rwsem);
1299 return 0;
1300 err_put:
1301 vfio_group_put(group);
1302 err_unlock:
1303 up_write(&group->group_rwsem);
1304 return ret;
1305 }
1306
1307 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1308 {
1309 struct vfio_group *group = filep->private_data;
1310
1311 filep->private_data = NULL;
1312
1313 down_write(&group->group_rwsem);
1314 /*
1315 * Device FDs hold a group file reference, therefore the group release
1316 * is only called when there are no open devices.
1317 */
1318 WARN_ON(group->notifier.head);
1319 if (group->container) {
1320 WARN_ON(group->container_users != 1);
1321 __vfio_group_unset_container(group);
1322 }
1323 group->opened_file = NULL;
1324 up_write(&group->group_rwsem);
1325
1326 vfio_group_put(group);
1327
1328 return 0;
1329 }
1330
1331 static const struct file_operations vfio_group_fops = {
1332 .owner = THIS_MODULE,
1333 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1334 .compat_ioctl = compat_ptr_ioctl,
1335 .open = vfio_group_fops_open,
1336 .release = vfio_group_fops_release,
1337 };
1338
1339 /*
1340 * VFIO Device fd
1341 */
1342 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1343 {
1344 struct vfio_device *device = filep->private_data;
1345
1346 mutex_lock(&device->dev_set->lock);
1347 vfio_assert_device_open(device);
1348 down_read(&device->group->group_rwsem);
1349 if (device->open_count == 1 && device->ops->close_device)
1350 device->ops->close_device(device);
1351 up_read(&device->group->group_rwsem);
1352 device->open_count--;
1353 if (device->open_count == 0)
1354 device->kvm = NULL;
1355 mutex_unlock(&device->dev_set->lock);
1356
1357 module_put(device->dev->driver->owner);
1358
1359 vfio_device_unassign_container(device);
1360
1361 vfio_device_put(device);
1362
1363 return 0;
1364 }
1365
1366 /*
1367 * vfio_mig_get_next_state - Compute the next step in the FSM
1368 * @cur_fsm - The current state the device is in
1369 * @new_fsm - The target state to reach
1370 * @next_fsm - Pointer to the next step to get to new_fsm
1371 *
1372 * Return 0 upon success, otherwise -errno
1373 * Upon success the next step in the state progression between cur_fsm and
1374 * new_fsm will be set in next_fsm.
1375 *
1376 * This breaks down requests for combination transitions into smaller steps and
1377 * returns the next step to get to new_fsm. The function may need to be called
1378 * multiple times before reaching new_fsm.
1379 *
1380 */
1381 int vfio_mig_get_next_state(struct vfio_device *device,
1382 enum vfio_device_mig_state cur_fsm,
1383 enum vfio_device_mig_state new_fsm,
1384 enum vfio_device_mig_state *next_fsm)
1385 {
1386 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
1387 /*
1388 * The coding in this table requires the driver to implement the
1389 * following FSM arcs:
1390 * RESUMING -> STOP
1391 * STOP -> RESUMING
1392 * STOP -> STOP_COPY
1393 * STOP_COPY -> STOP
1394 *
1395 * If P2P is supported then the driver must also implement these FSM
1396 * arcs:
1397 * RUNNING -> RUNNING_P2P
1398 * RUNNING_P2P -> RUNNING
1399 * RUNNING_P2P -> STOP
1400 * STOP -> RUNNING_P2P
1401 * Without P2P the driver must implement:
1402 * RUNNING -> STOP
1403 * STOP -> RUNNING
1404 *
1405 * The coding will step through multiple states for some combination
1406 * transitions; if all optional features are supported, this means the
1407 * following ones:
1408 * RESUMING -> STOP -> RUNNING_P2P
1409 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
1410 * RESUMING -> STOP -> STOP_COPY
1411 * RUNNING -> RUNNING_P2P -> STOP
1412 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
1413 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
1414 * RUNNING_P2P -> STOP -> RESUMING
1415 * RUNNING_P2P -> STOP -> STOP_COPY
1416 * STOP -> RUNNING_P2P -> RUNNING
1417 * STOP_COPY -> STOP -> RESUMING
1418 * STOP_COPY -> STOP -> RUNNING_P2P
1419 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
1420 */
1421 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
1422 [VFIO_DEVICE_STATE_STOP] = {
1423 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1424 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1425 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1426 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1427 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1428 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1429 },
1430 [VFIO_DEVICE_STATE_RUNNING] = {
1431 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
1432 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1433 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
1434 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1435 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1436 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1437 },
1438 [VFIO_DEVICE_STATE_STOP_COPY] = {
1439 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1440 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1441 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1442 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1443 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1444 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1445 },
1446 [VFIO_DEVICE_STATE_RESUMING] = {
1447 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1448 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1449 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1450 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1451 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1452 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1453 },
1454 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
1455 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1456 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1457 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1458 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1459 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1460 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1461 },
1462 [VFIO_DEVICE_STATE_ERROR] = {
1463 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
1464 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
1465 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
1466 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
1467 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
1468 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1469 },
1470 };
1471
1472 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
1473 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
1474 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
1475 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
1476 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
1477 [VFIO_DEVICE_STATE_RUNNING_P2P] =
1478 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
1479 [VFIO_DEVICE_STATE_ERROR] = ~0U,
1480 };
1481
1482 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1483 (state_flags_table[cur_fsm] & device->migration_flags) !=
1484 state_flags_table[cur_fsm]))
1485 return -EINVAL;
1486
1487 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1488 (state_flags_table[new_fsm] & device->migration_flags) !=
1489 state_flags_table[new_fsm])
1490 return -EINVAL;
1491
1492 /*
1493 * Arcs touching optional and unsupported states are skipped over. The
1494 * driver will instead see an arc from the original state to the next
1495 * logical state, as per the above comment.
1496 */
1497 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
1498 while ((state_flags_table[*next_fsm] & device->migration_flags) !=
1499 state_flags_table[*next_fsm])
1500 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
1501
1502 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
1503 }
1504 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
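/*
 * Usage sketch (illustrative): a driver's migration_set_state() callback
 * typically walks the FSM one reachable arc at a time until the requested
 * state is reached. vdev and cur are placeholders for the driver's own
 * bookkeeping:
 *
 *	while (cur != new_state) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			break;
 *		... perform the device specific cur -> next step ...
 *		cur = next;
 *	}
 */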
1505
1506 /*
1507 * Convert the driver's struct file into a FD number and return it to userspace
1508 */
1509 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
1510 struct vfio_device_feature_mig_state *mig)
1511 {
1512 int ret;
1513 int fd;
1514
1515 fd = get_unused_fd_flags(O_CLOEXEC);
1516 if (fd < 0) {
1517 ret = fd;
1518 goto out_fput;
1519 }
1520
1521 mig->data_fd = fd;
1522 if (copy_to_user(arg, mig, sizeof(*mig))) {
1523 ret = -EFAULT;
1524 goto out_put_unused;
1525 }
1526 fd_install(fd, filp);
1527 return 0;
1528
1529 out_put_unused:
1530 put_unused_fd(fd);
1531 out_fput:
1532 fput(filp);
1533 return ret;
1534 }
1535
1536 static int
1537 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
1538 u32 flags, void __user *arg,
1539 size_t argsz)
1540 {
1541 size_t minsz =
1542 offsetofend(struct vfio_device_feature_mig_state, data_fd);
1543 struct vfio_device_feature_mig_state mig;
1544 struct file *filp = NULL;
1545 int ret;
1546
1547 if (!device->mig_ops)
1548 return -ENOTTY;
1549
1550 ret = vfio_check_feature(flags, argsz,
1551 VFIO_DEVICE_FEATURE_SET |
1552 VFIO_DEVICE_FEATURE_GET,
1553 sizeof(mig));
1554 if (ret != 1)
1555 return ret;
1556
1557 if (copy_from_user(&mig, arg, minsz))
1558 return -EFAULT;
1559
1560 if (flags & VFIO_DEVICE_FEATURE_GET) {
1561 enum vfio_device_mig_state curr_state;
1562
1563 ret = device->mig_ops->migration_get_state(device,
1564 &curr_state);
1565 if (ret)
1566 return ret;
1567 mig.device_state = curr_state;
1568 goto out_copy;
1569 }
1570
1571 /* Handle the VFIO_DEVICE_FEATURE_SET */
1572 filp = device->mig_ops->migration_set_state(device, mig.device_state);
1573 if (IS_ERR(filp) || !filp)
1574 goto out_copy;
1575
1576 return vfio_ioct_mig_return_fd(filp, arg, &mig);
1577 out_copy:
1578 mig.data_fd = -1;
1579 if (copy_to_user(arg, &mig, sizeof(mig)))
1580 return -EFAULT;
1581 if (IS_ERR(filp))
1582 return PTR_ERR(filp);
1583 return 0;
1584 }
1585
1586 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
1587 u32 flags, void __user *arg,
1588 size_t argsz)
1589 {
1590 struct vfio_device_feature_migration mig = {
1591 .flags = device->migration_flags,
1592 };
1593 int ret;
1594
1595 if (!device->mig_ops)
1596 return -ENOTTY;
1597
1598 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1599 sizeof(mig));
1600 if (ret != 1)
1601 return ret;
1602 if (copy_to_user(arg, &mig, sizeof(mig)))
1603 return -EFAULT;
1604 return 0;
1605 }
1606
1607 static int vfio_ioctl_device_feature(struct vfio_device *device,
1608 struct vfio_device_feature __user *arg)
1609 {
1610 size_t minsz = offsetofend(struct vfio_device_feature, flags);
1611 struct vfio_device_feature feature;
1612
1613 if (copy_from_user(&feature, arg, minsz))
1614 return -EFAULT;
1615
1616 if (feature.argsz < minsz)
1617 return -EINVAL;
1618
1619 /* Check unknown flags */
1620 if (feature.flags &
1621 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1622 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1623 return -EINVAL;
1624
1625 /* GET & SET are mutually exclusive except with PROBE */
1626 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1627 (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1628 (feature.flags & VFIO_DEVICE_FEATURE_GET))
1629 return -EINVAL;
1630
1631 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1632 case VFIO_DEVICE_FEATURE_MIGRATION:
1633 return vfio_ioctl_device_feature_migration(
1634 device, feature.flags, arg->data,
1635 feature.argsz - minsz);
1636 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1637 return vfio_ioctl_device_feature_mig_device_state(
1638 device, feature.flags, arg->data,
1639 feature.argsz - minsz);
1640 default:
1641 if (unlikely(!device->ops->device_feature))
1642 return -EINVAL;
1643 return device->ops->device_feature(device, feature.flags,
1644 arg->data,
1645 feature.argsz - minsz);
1646 }
1647 }
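/*
 * Userspace usage sketch (illustrative): the feature ioctl carries a
 * variable sized payload after the header, e.g. reading the migration
 * capability flags looks roughly like:
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_migration mig;
 *	} buf = {
 *		.hdr.argsz = sizeof(buf),
 *		.hdr.flags = VFIO_DEVICE_FEATURE_GET |
 *			     VFIO_DEVICE_FEATURE_MIGRATION,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &buf);
 */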
1648
1649 static long vfio_device_fops_unl_ioctl(struct file *filep,
1650 unsigned int cmd, unsigned long arg)
1651 {
1652 struct vfio_device *device = filep->private_data;
1653
1654 switch (cmd) {
1655 case VFIO_DEVICE_FEATURE:
1656 return vfio_ioctl_device_feature(device, (void __user *)arg);
1657 default:
1658 if (unlikely(!device->ops->ioctl))
1659 return -EINVAL;
1660 return device->ops->ioctl(device, cmd, arg);
1661 }
1662 }
1663
1664 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1665 size_t count, loff_t *ppos)
1666 {
1667 struct vfio_device *device = filep->private_data;
1668
1669 if (unlikely(!device->ops->read))
1670 return -EINVAL;
1671
1672 return device->ops->read(device, buf, count, ppos);
1673 }
1674
1675 static ssize_t vfio_device_fops_write(struct file *filep,
1676 const char __user *buf,
1677 size_t count, loff_t *ppos)
1678 {
1679 struct vfio_device *device = filep->private_data;
1680
1681 if (unlikely(!device->ops->write))
1682 return -EINVAL;
1683
1684 return device->ops->write(device, buf, count, ppos);
1685 }
1686
1687 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1688 {
1689 struct vfio_device *device = filep->private_data;
1690
1691 if (unlikely(!device->ops->mmap))
1692 return -EINVAL;
1693
1694 return device->ops->mmap(device, vma);
1695 }
1696
1697 static const struct file_operations vfio_device_fops = {
1698 .owner = THIS_MODULE,
1699 .release = vfio_device_fops_release,
1700 .read = vfio_device_fops_read,
1701 .write = vfio_device_fops_write,
1702 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1703 .compat_ioctl = compat_ptr_ioctl,
1704 .mmap = vfio_device_fops_mmap,
1705 };
1706
1707 /**
1708 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
1709 * @file: VFIO group file
1710 *
1711 * The returned iommu_group is valid as long as a ref is held on the file.
1712 */
1713 struct iommu_group *vfio_file_iommu_group(struct file *file)
1714 {
1715 struct vfio_group *group = file->private_data;
1716
1717 if (file->f_op != &vfio_group_fops)
1718 return NULL;
1719 return group->iommu_group;
1720 }
1721 EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
1722
1723 /**
1724 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1725 * is always CPU cache coherent
1726 * @file: VFIO group file
1727 *
1728 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1729 * bit in DMA transactions. A return of false indicates that the user has
1730 * rights to access additional instructions such as wbinvd on x86.
1731 */
1732 bool vfio_file_enforced_coherent(struct file *file)
1733 {
1734 struct vfio_group *group = file->private_data;
1735 bool ret;
1736
1737 if (file->f_op != &vfio_group_fops)
1738 return true;
1739
1740 down_read(&group->group_rwsem);
1741 if (group->container) {
1742 ret = vfio_ioctl_check_extension(group->container,
1743 VFIO_DMA_CC_IOMMU);
1744 } else {
1745 /*
1746 * Since the coherency state is determined only once a container
1747 * is attached, the user must do so before they can prove they
1748 * have permission.
1749 */
1750 ret = true;
1751 }
1752 up_read(&group->group_rwsem);
1753 return ret;
1754 }
1755 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
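/*
 * Note (for orientation; the consumers live outside this file): the
 * vfio_file_*() helpers are intended for KVM's vfio pseudo device
 * (virt/kvm/vfio.c). When userspace adds a group file with
 * KVM_DEV_VFIO_GROUP_ADD, KVM uses vfio_file_enforced_coherent() to decide
 * whether the guest can generate non-coherent DMA (e.g. for wbinvd handling
 * on x86) and vfio_file_set_kvm() to make the kvm pointer available to
 * drivers through device->kvm.
 */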
1756
1757 /**
1758 * vfio_file_set_kvm - Link a kvm with VFIO drivers
1759 * @file: VFIO group file
1760 * @kvm: KVM to link
1761 *
1762 * When a VFIO device is first opened the KVM will be available in
1763 * device->kvm if one was associated with the group.
1764 */
1765 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1766 {
1767 struct vfio_group *group = file->private_data;
1768
1769 if (file->f_op != &vfio_group_fops)
1770 return;
1771
1772 down_write(&group->group_rwsem);
1773 group->kvm = kvm;
1774 up_write(&group->group_rwsem);
1775 }
1776 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1777
1778 /**
1779 * vfio_file_has_dev - True if the VFIO file is a handle for device
1780 * @file: VFIO file to check
1781 * @device: Device that must be part of the file
1782 *
1783 * Returns true if given file has permission to manipulate the given device.
1784 */
1785 bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
1786 {
1787 struct vfio_group *group = file->private_data;
1788
1789 if (file->f_op != &vfio_group_fops)
1790 return false;
1791
1792 return group == device->group;
1793 }
1794 EXPORT_SYMBOL_GPL(vfio_file_has_dev);
1795
1796 /*
1797 * Sub-module support
1798 */
1799 /*
1800 * Helper for managing a buffer of info chain capabilities, allocate or
1801 * reallocate a buffer with additional @size, filling in @id and @version
1802 * of the capability. A pointer to the new capability is returned.
1803 *
1804 * NB. The chain is based at the head of the buffer, so new entries are
1805 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1806 * next offsets prior to copying to the user buffer.
1807 */
1808 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1809 size_t size, u16 id, u16 version)
1810 {
1811 void *buf;
1812 struct vfio_info_cap_header *header, *tmp;
1813
1814 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1815 if (!buf) {
1816 kfree(caps->buf);
1817 caps->buf = NULL;
1818 caps->size = 0;
1819 return ERR_PTR(-ENOMEM);
1820 }
1821
1822 caps->buf = buf;
1823 header = buf + caps->size;
1824
1825 /* Eventually copied to user buffer, zero */
1826 memset(header, 0, size);
1827
1828 header->id = id;
1829 header->version = version;
1830
1831 /* Add to the end of the capability chain */
1832 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1833 ; /* nothing */
1834
1835 tmp->next = caps->size;
1836 caps->size += size;
1837
1838 return header;
1839 }
1840 EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
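
/*
 * Usage sketch (illustrative only): the typical pattern in a driver's _INFO
 * ioctl is to build the capability chain relative to offset 0, then shift it
 * by the size of the fixed info structure before copying both to userspace.
 * The info/arg variables and the capability being added are assumed context.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &cap.header, sizeof(cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_DEVICE_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */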

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
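
/*
 * Usage sketch (illustrative only): a device driver's VFIO_DEVICE_SET_IRQS
 * ioctl handler typically copies in the fixed header, validates it with this
 * helper, then duplicates the trailing variable-sized data.  The num_irqs
 * value and VFIO_PCI_NUM_IRQS are example parameters a PCI-style driver
 * might pass, not requirements of this helper.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */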

/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @device [in]  : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
		   int npage, int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group = device->group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!user_pfn || !phys_pfn || !npage ||
	    !vfio_assert_device_open(device))
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	if (group->dev_counter > 1)
		return -EINVAL;

	/* group->container cannot change while a vfio device is open */
	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
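
/*
 * Usage sketch (illustrative only): an mdev-style vendor driver pinning a
 * single guest page so its parent device can DMA to it.  The gpa and vdev
 * variables are assumed context; see the matching unpin sketch after
 * vfio_unpin_pages().
 *
 *	unsigned long user_pfn = gpa >> PAGE_SHIFT;
 *	unsigned long phys_pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, &user_pfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	// device DMA may now target PFN_PHYS(phys_pfn)
 */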

/*
 * Unpin set of host PFNs for local domain only.
 * @device [in]  : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of user/guest
 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
		     int npage)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!user_pfn || !npage || !vfio_assert_device_open(device))
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	/* group->container cannot change while a vfio device is open */
	container = device->group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
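
/*
 * Usage sketch (illustrative only), completing the pin sketch above: once
 * the device no longer needs the mapping, the same guest PFN array is
 * unpinned.
 *
 *	ret = vfio_unpin_pages(vdev, &user_pfn, 1);
 *	WARN_ON(ret != 1);
 */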

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]   : VFIO device
 * @user_iova [in]: base IOVA of a user space buffer
 * @data [in]     : pointer to kernel buffer
 * @len [in]      : kernel buffer length
 * @write         : indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
		size_t len, bool write)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	/* group->container cannot change while a vfio device is open */
	container = device->group->container;
	driver = container->iommu_driver;

	if (likely(driver && driver->ops->dma_rw))
		ret = driver->ops->dma_rw(container->iommu_data,
					  user_iova, data, len, write);
	else
		ret = -ENOTTY;
	return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);
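
/*
 * Usage sketch (illustrative only): a mediated-device driver reading a
 * descriptor from guest memory at a given IOVA, then writing a status word
 * back, without pinning the pages.  The my_desc structure and the iova and
 * vdev variables are hypothetical context.
 *
 *	struct my_desc desc;
 *	u32 status = 0;
 *
 *	ret = vfio_dma_rw(vdev, iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *	ret = vfio_dma_rw(vdev, iova + offsetof(struct my_desc, status),
 *			  &status, sizeof(status), true);
 */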

static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	lockdep_assert_held_read(&group->group_rwsem);

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	lockdep_assert_held_read(&group->group_rwsem);

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	return ret;
}

int vfio_register_notifier(struct vfio_device *device,
			   enum vfio_notify_type type, unsigned long *events,
			   struct notifier_block *nb)
{
	struct vfio_group *group = device->group;
	int ret;

	if (!nb || !events || (*events == 0) ||
	    !vfio_assert_device_open(device))
		return -EINVAL;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct vfio_device *device,
			     enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group = device->group;
	int ret;

	if (!nb || !vfio_assert_device_open(device))
		return -EINVAL;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
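
/*
 * Usage sketch (illustrative only): a driver that pins pages usually
 * registers for DMA unmap notifications so it can drop pins that overlap an
 * unmapped range.  The my_dma_unmap_cb callback and my_invalidate() helper
 * are hypothetical names; vdev is assumed caller-side context.
 *
 *	static int my_dma_unmap_cb(struct notifier_block *nb,
 *				   unsigned long action, void *data)
 *	{
 *		struct vfio_iommu_type1_dma_unmap *unmap = data;
 *
 *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP)
 *			my_invalidate(unmap->iova, unmap->size);
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *	struct notifier_block nb = { .notifier_call = my_dma_unmap_cb };
 *
 *	ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY, &events, &nb);
 *	...
 *	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &nb);
 */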

/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.group_ida);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	ida_destroy(&vfio.group_ida);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");