1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
18 #include <linux/fs.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
35 #include "vfio.h"
36 
37 #define DRIVER_VERSION	"0.3"
38 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
39 #define DRIVER_DESC	"VFIO - User Level meta-driver"
40 
41 static struct vfio {
42 	struct class			*class;
43 	struct list_head		iommu_drivers_list;
44 	struct mutex			iommu_drivers_lock;
45 	struct list_head		group_list;
46 	struct mutex			group_lock; /* locks group_list */
47 	struct ida			group_ida;
48 	dev_t				group_devt;
49 } vfio;
50 
51 struct vfio_iommu_driver {
52 	const struct vfio_iommu_driver_ops	*ops;
53 	struct list_head			vfio_next;
54 };
55 
56 struct vfio_container {
57 	struct kref			kref;
58 	struct list_head		group_list;
59 	struct rw_semaphore		group_lock;
60 	struct vfio_iommu_driver	*iommu_driver;
61 	void				*iommu_data;
62 	bool				noiommu;
63 };
64 
65 struct vfio_group {
66 	struct device 			dev;
67 	struct cdev			cdev;
68 	refcount_t			users;
69 	unsigned int			container_users;
70 	struct iommu_group		*iommu_group;
71 	struct vfio_container		*container;
72 	struct list_head		device_list;
73 	struct mutex			device_lock;
74 	struct list_head		vfio_next;
75 	struct list_head		container_next;
76 	enum vfio_group_type		type;
77 	unsigned int			dev_counter;
78 	struct rw_semaphore		group_rwsem;
79 	struct kvm			*kvm;
80 	struct file			*opened_file;
81 	struct blocking_notifier_head	notifier;
82 };
83 
84 #ifdef CONFIG_VFIO_NOIOMMU
85 static bool noiommu __read_mostly;
86 module_param_named(enable_unsafe_noiommu_mode,
87 		   noiommu, bool, S_IRUGO | S_IWUSR);
88 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
89 #endif
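
/*
 * Usage note: when built with CONFIG_VFIO_NOIOMMU, this mode is normally
 * enabled via the module parameter above, e.g.
 * vfio.enable_unsafe_noiommu_mode=1 on the kernel command line or through
 * /sys/module/vfio/parameters/enable_unsafe_noiommu_mode.  Groups created
 * while it is set are named with the "noiommu-" prefix (see
 * vfio_create_group() below).
 */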
90 
91 static DEFINE_XARRAY(vfio_device_set_xa);
92 static const struct file_operations vfio_group_fops;
93 
94 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
95 {
96 	unsigned long idx = (unsigned long)set_id;
97 	struct vfio_device_set *new_dev_set;
98 	struct vfio_device_set *dev_set;
99 
100 	if (WARN_ON(!set_id))
101 		return -EINVAL;
102 
103 	/*
104 	 * Atomically acquire a singleton object in the xarray for this set_id
105 	 */
106 	xa_lock(&vfio_device_set_xa);
107 	dev_set = xa_load(&vfio_device_set_xa, idx);
108 	if (dev_set)
109 		goto found_get_ref;
110 	xa_unlock(&vfio_device_set_xa);
111 
112 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
113 	if (!new_dev_set)
114 		return -ENOMEM;
115 	mutex_init(&new_dev_set->lock);
116 	INIT_LIST_HEAD(&new_dev_set->device_list);
117 	new_dev_set->set_id = set_id;
118 
119 	xa_lock(&vfio_device_set_xa);
120 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
121 			       GFP_KERNEL);
122 	if (!dev_set) {
123 		dev_set = new_dev_set;
124 		goto found_get_ref;
125 	}
126 
127 	kfree(new_dev_set);
128 	if (xa_is_err(dev_set)) {
129 		xa_unlock(&vfio_device_set_xa);
130 		return xa_err(dev_set);
131 	}
132 
133 found_get_ref:
134 	dev_set->device_count++;
135 	xa_unlock(&vfio_device_set_xa);
136 	mutex_lock(&dev_set->lock);
137 	device->dev_set = dev_set;
138 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
139 	mutex_unlock(&dev_set->lock);
140 	return 0;
141 }
142 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
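
/*
 * Illustrative usage (driver side, names assumed): devices that must be
 * reset as a unit share one set_id so that they land in the same
 * vfio_device_set, e.g. a PCI driver keying the set on the bus it resets:
 *
 *	ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
 *
 * Drivers whose devices reset independently simply skip the call; such
 * devices get a singleton set assigned in __vfio_register_dev().
 */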
143 
144 static void vfio_release_device_set(struct vfio_device *device)
145 {
146 	struct vfio_device_set *dev_set = device->dev_set;
147 
148 	if (!dev_set)
149 		return;
150 
151 	mutex_lock(&dev_set->lock);
152 	list_del(&device->dev_set_list);
153 	mutex_unlock(&dev_set->lock);
154 
155 	xa_lock(&vfio_device_set_xa);
156 	if (!--dev_set->device_count) {
157 		__xa_erase(&vfio_device_set_xa,
158 			   (unsigned long)dev_set->set_id);
159 		mutex_destroy(&dev_set->lock);
160 		kfree(dev_set);
161 	}
162 	xa_unlock(&vfio_device_set_xa);
163 }
164 
165 #ifdef CONFIG_VFIO_NOIOMMU
166 static void *vfio_noiommu_open(unsigned long arg)
167 {
168 	if (arg != VFIO_NOIOMMU_IOMMU)
169 		return ERR_PTR(-EINVAL);
170 	if (!capable(CAP_SYS_RAWIO))
171 		return ERR_PTR(-EPERM);
172 
173 	return NULL;
174 }
175 
176 static void vfio_noiommu_release(void *iommu_data)
177 {
178 }
179 
180 static long vfio_noiommu_ioctl(void *iommu_data,
181 			       unsigned int cmd, unsigned long arg)
182 {
183 	if (cmd == VFIO_CHECK_EXTENSION)
184 		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
185 
186 	return -ENOTTY;
187 }
188 
189 static int vfio_noiommu_attach_group(void *iommu_data,
190 		struct iommu_group *iommu_group, enum vfio_group_type type)
191 {
192 	return 0;
193 }
194 
195 static void vfio_noiommu_detach_group(void *iommu_data,
196 				      struct iommu_group *iommu_group)
197 {
198 }
199 
200 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
201 	.name = "vfio-noiommu",
202 	.owner = THIS_MODULE,
203 	.open = vfio_noiommu_open,
204 	.release = vfio_noiommu_release,
205 	.ioctl = vfio_noiommu_ioctl,
206 	.attach_group = vfio_noiommu_attach_group,
207 	.detach_group = vfio_noiommu_detach_group,
208 };
209 
210 /*
211  * Only noiommu containers can use vfio-noiommu and noiommu containers can only
212  * use vfio-noiommu.
213  */
214 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
215 		const struct vfio_iommu_driver *driver)
216 {
217 	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
218 }
219 #else
220 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
221 		const struct vfio_iommu_driver *driver)
222 {
223 	return true;
224 }
225 #endif /* CONFIG_VFIO_NOIOMMU */
226 
227 /*
228  * IOMMU driver registration
229  */
230 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
231 {
232 	struct vfio_iommu_driver *driver, *tmp;
233 
234 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
235 	if (!driver)
236 		return -ENOMEM;
237 
238 	driver->ops = ops;
239 
240 	mutex_lock(&vfio.iommu_drivers_lock);
241 
242 	/* Check for duplicates */
243 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
244 		if (tmp->ops == ops) {
245 			mutex_unlock(&vfio.iommu_drivers_lock);
246 			kfree(driver);
247 			return -EINVAL;
248 		}
249 	}
250 
251 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
252 
253 	mutex_unlock(&vfio.iommu_drivers_lock);
254 
255 	return 0;
256 }
257 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
258 
259 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
260 {
261 	struct vfio_iommu_driver *driver;
262 
263 	mutex_lock(&vfio.iommu_drivers_lock);
264 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
265 		if (driver->ops == ops) {
266 			list_del(&driver->vfio_next);
267 			mutex_unlock(&vfio.iommu_drivers_lock);
268 			kfree(driver);
269 			return;
270 		}
271 	}
272 	mutex_unlock(&vfio.iommu_drivers_lock);
273 }
274 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
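
/*
 * A minimal sketch of how an IOMMU backend plugs in (identifiers are
 * placeholders); in-tree backends follow this pattern from their module
 * init/exit paths:
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_driver_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_driver_ops);
 *	}
 */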
275 
276 static void vfio_group_get(struct vfio_group *group);
277 
278 /*
279  * Container objects - containers are created when /dev/vfio/vfio is
280  * opened, but their lifecycle extends until the last user is done, so
281  * it's freed via kref.  Must support container/group/device being
282  * closed in any order.
283  */
284 static void vfio_container_get(struct vfio_container *container)
285 {
286 	kref_get(&container->kref);
287 }
288 
289 static void vfio_container_release(struct kref *kref)
290 {
291 	struct vfio_container *container;
292 	container = container_of(kref, struct vfio_container, kref);
293 
294 	kfree(container);
295 }
296 
297 static void vfio_container_put(struct vfio_container *container)
298 {
299 	kref_put(&container->kref, vfio_container_release);
300 }
301 
302 /*
303  * Group objects - create, release, get, put, search
304  */
305 static struct vfio_group *
306 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
307 {
308 	struct vfio_group *group;
309 
310 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
311 		if (group->iommu_group == iommu_group) {
312 			vfio_group_get(group);
313 			return group;
314 		}
315 	}
316 	return NULL;
317 }
318 
319 static struct vfio_group *
320 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
321 {
322 	struct vfio_group *group;
323 
324 	mutex_lock(&vfio.group_lock);
325 	group = __vfio_group_get_from_iommu(iommu_group);
326 	mutex_unlock(&vfio.group_lock);
327 	return group;
328 }
329 
330 static void vfio_group_release(struct device *dev)
331 {
332 	struct vfio_group *group = container_of(dev, struct vfio_group, dev);
333 
334 	mutex_destroy(&group->device_lock);
335 	iommu_group_put(group->iommu_group);
336 	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
337 	kfree(group);
338 }
339 
340 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
341 					   enum vfio_group_type type)
342 {
343 	struct vfio_group *group;
344 	int minor;
345 
346 	group = kzalloc(sizeof(*group), GFP_KERNEL);
347 	if (!group)
348 		return ERR_PTR(-ENOMEM);
349 
350 	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
351 	if (minor < 0) {
352 		kfree(group);
353 		return ERR_PTR(minor);
354 	}
355 
356 	device_initialize(&group->dev);
357 	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
358 	group->dev.class = vfio.class;
359 	group->dev.release = vfio_group_release;
360 	cdev_init(&group->cdev, &vfio_group_fops);
361 	group->cdev.owner = THIS_MODULE;
362 
363 	refcount_set(&group->users, 1);
364 	init_rwsem(&group->group_rwsem);
365 	INIT_LIST_HEAD(&group->device_list);
366 	mutex_init(&group->device_lock);
367 	group->iommu_group = iommu_group;
368 	/* put in vfio_group_release() */
369 	iommu_group_ref_get(iommu_group);
370 	group->type = type;
371 	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
372 
373 	return group;
374 }
375 
376 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
377 		enum vfio_group_type type)
378 {
379 	struct vfio_group *group;
380 	struct vfio_group *ret;
381 	int err;
382 
383 	group = vfio_group_alloc(iommu_group, type);
384 	if (IS_ERR(group))
385 		return group;
386 
387 	err = dev_set_name(&group->dev, "%s%d",
388 			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
389 			   iommu_group_id(iommu_group));
390 	if (err) {
391 		ret = ERR_PTR(err);
392 		goto err_put;
393 	}
394 
395 	mutex_lock(&vfio.group_lock);
396 
397 	/* Did we race creating this group? */
398 	ret = __vfio_group_get_from_iommu(iommu_group);
399 	if (ret)
400 		goto err_unlock;
401 
402 	err = cdev_device_add(&group->cdev, &group->dev);
403 	if (err) {
404 		ret = ERR_PTR(err);
405 		goto err_unlock;
406 	}
407 
408 	list_add(&group->vfio_next, &vfio.group_list);
409 
410 	mutex_unlock(&vfio.group_lock);
411 	return group;
412 
413 err_unlock:
414 	mutex_unlock(&vfio.group_lock);
415 err_put:
416 	put_device(&group->dev);
417 	return ret;
418 }
419 
420 static void vfio_group_put(struct vfio_group *group)
421 {
422 	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
423 		return;
424 
425 	/*
426 	 * These data structures all have paired operations that can only be
427 	 * undone when the caller holds a live reference on the group. Since all
428 	 * pairs must be undone these WARN_ON's indicate some caller did not
429 	 * properly hold the group reference.
430 	 */
431 	WARN_ON(!list_empty(&group->device_list));
432 	WARN_ON(group->container || group->container_users);
433 	WARN_ON(group->notifier.head);
434 
435 	list_del(&group->vfio_next);
436 	cdev_device_del(&group->cdev, &group->dev);
437 	mutex_unlock(&vfio.group_lock);
438 
439 	put_device(&group->dev);
440 }
441 
442 static void vfio_group_get(struct vfio_group *group)
443 {
444 	refcount_inc(&group->users);
445 }
446 
447 /*
448  * Device objects - create, release, get, put, search
449  */
450 /* Device reference always implies a group reference */
451 static void vfio_device_put(struct vfio_device *device)
452 {
453 	if (refcount_dec_and_test(&device->refcount))
454 		complete(&device->comp);
455 }
456 
457 static bool vfio_device_try_get(struct vfio_device *device)
458 {
459 	return refcount_inc_not_zero(&device->refcount);
460 }
461 
462 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
463 						 struct device *dev)
464 {
465 	struct vfio_device *device;
466 
467 	mutex_lock(&group->device_lock);
468 	list_for_each_entry(device, &group->device_list, group_next) {
469 		if (device->dev == dev && vfio_device_try_get(device)) {
470 			mutex_unlock(&group->device_lock);
471 			return device;
472 		}
473 	}
474 	mutex_unlock(&group->device_lock);
475 	return NULL;
476 }
477 
478 /*
479  * VFIO driver API
480  */
481 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
482 			 const struct vfio_device_ops *ops)
483 {
484 	init_completion(&device->comp);
485 	device->dev = dev;
486 	device->ops = ops;
487 }
488 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
489 
490 void vfio_uninit_group_dev(struct vfio_device *device)
491 {
492 	vfio_release_device_set(device);
493 }
494 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
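
/*
 * A rough sketch of the expected driver call sequence (identifiers such as
 * my_ops and vdev are placeholders, not part of this API):
 *
 *	probe:
 *		vfio_init_group_dev(&vdev->vdev, &pdev->dev, &my_ops);
 *		err = vfio_register_group_dev(&vdev->vdev);
 *		if (err)
 *			vfio_uninit_group_dev(&vdev->vdev);
 *
 *	remove:
 *		vfio_unregister_group_dev(&vdev->vdev);
 *		vfio_uninit_group_dev(&vdev->vdev);
 */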
495 
496 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
497 		enum vfio_group_type type)
498 {
499 	struct iommu_group *iommu_group;
500 	struct vfio_group *group;
501 	int ret;
502 
503 	iommu_group = iommu_group_alloc();
504 	if (IS_ERR(iommu_group))
505 		return ERR_CAST(iommu_group);
506 
507 	iommu_group_set_name(iommu_group, "vfio-noiommu");
508 	ret = iommu_group_add_device(iommu_group, dev);
509 	if (ret)
510 		goto out_put_group;
511 
512 	group = vfio_create_group(iommu_group, type);
513 	if (IS_ERR(group)) {
514 		ret = PTR_ERR(group);
515 		goto out_remove_device;
516 	}
517 	iommu_group_put(iommu_group);
518 	return group;
519 
520 out_remove_device:
521 	iommu_group_remove_device(dev);
522 out_put_group:
523 	iommu_group_put(iommu_group);
524 	return ERR_PTR(ret);
525 }
526 
527 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
528 {
529 	struct iommu_group *iommu_group;
530 	struct vfio_group *group;
531 
532 	iommu_group = iommu_group_get(dev);
533 #ifdef CONFIG_VFIO_NOIOMMU
534 	if (!iommu_group && noiommu) {
535 		/*
536 		 * With noiommu enabled, create an IOMMU group for devices that
537 		 * don't already have one, implying no IOMMU hardware/driver
538 		 * exists.  Taint the kernel because we're about to give a DMA
539 		 * capable device to a user without IOMMU protection.
540 		 */
541 		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
542 		if (!IS_ERR(group)) {
543 			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
544 			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
545 		}
546 		return group;
547 	}
548 #endif
549 	if (!iommu_group)
550 		return ERR_PTR(-EINVAL);
551 
552 	/*
553 	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
554 	 * restore cache coherency. It has to be checked here because it is only
555 	 * valid for cases where we are using iommu groups.
556 	 */
557 	if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
558 		iommu_group_put(iommu_group);
559 		return ERR_PTR(-EINVAL);
560 	}
561 
562 	group = vfio_group_get_from_iommu(iommu_group);
563 	if (!group)
564 		group = vfio_create_group(iommu_group, VFIO_IOMMU);
565 
566 	/* The vfio_group holds a reference to the iommu_group */
567 	iommu_group_put(iommu_group);
568 	return group;
569 }
570 
571 static int __vfio_register_dev(struct vfio_device *device,
572 		struct vfio_group *group)
573 {
574 	struct vfio_device *existing_device;
575 
576 	if (IS_ERR(group))
577 		return PTR_ERR(group);
578 
579 	/*
580 	 * If the driver doesn't specify a set then the device is added to a
581 	 * singleton set just for itself.
582 	 */
583 	if (!device->dev_set)
584 		vfio_assign_device_set(device, device);
585 
586 	existing_device = vfio_group_get_device(group, device->dev);
587 	if (existing_device) {
588 		dev_WARN(device->dev, "Device already exists on group %d\n",
589 			 iommu_group_id(group->iommu_group));
590 		vfio_device_put(existing_device);
591 		if (group->type == VFIO_NO_IOMMU ||
592 		    group->type == VFIO_EMULATED_IOMMU)
593 			iommu_group_remove_device(device->dev);
594 		vfio_group_put(group);
595 		return -EBUSY;
596 	}
597 
598 	/* Our reference on group is moved to the device */
599 	device->group = group;
600 
601 	/* Refcounting can't start until the driver calls register */
602 	refcount_set(&device->refcount, 1);
603 
604 	mutex_lock(&group->device_lock);
605 	list_add(&device->group_next, &group->device_list);
606 	group->dev_counter++;
607 	mutex_unlock(&group->device_lock);
608 
609 	return 0;
610 }
611 
612 int vfio_register_group_dev(struct vfio_device *device)
613 {
614 	return __vfio_register_dev(device,
615 		vfio_group_find_or_alloc(device->dev));
616 }
617 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
618 
619 /*
620  * Register a virtual device without IOMMU backing.  The user of this
621  * device must not be able to directly trigger unmediated DMA.
622  */
623 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
624 {
625 	return __vfio_register_dev(device,
626 		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
627 }
628 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
629 
630 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
631 						     char *buf)
632 {
633 	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
634 
635 	mutex_lock(&group->device_lock);
636 	list_for_each_entry(it, &group->device_list, group_next) {
637 		int ret;
638 
639 		if (it->ops->match) {
640 			ret = it->ops->match(it, buf);
641 			if (ret < 0) {
642 				device = ERR_PTR(ret);
643 				break;
644 			}
645 		} else {
646 			ret = !strcmp(dev_name(it->dev), buf);
647 		}
648 
649 		if (ret && vfio_device_try_get(it)) {
650 			device = it;
651 			break;
652 		}
653 	}
654 	mutex_unlock(&group->device_lock);
655 
656 	return device;
657 }
658 
659 /*
660  * Decrement the device reference count and wait for the device to be
661  * removed.  Open file descriptors for the device... */
662 void vfio_unregister_group_dev(struct vfio_device *device)
663 {
664 	struct vfio_group *group = device->group;
665 	unsigned int i = 0;
666 	bool interrupted = false;
667 	long rc;
668 
669 	vfio_device_put(device);
670 	rc = try_wait_for_completion(&device->comp);
671 	while (rc <= 0) {
672 		if (device->ops->request)
673 			device->ops->request(device, i++);
674 
675 		if (interrupted) {
676 			rc = wait_for_completion_timeout(&device->comp,
677 							 HZ * 10);
678 		} else {
679 			rc = wait_for_completion_interruptible_timeout(
680 				&device->comp, HZ * 10);
681 			if (rc < 0) {
682 				interrupted = true;
683 				dev_warn(device->dev,
684 					 "Device is currently in use, task"
685 					 " \"%s\" (%d) "
686 					 "blocked until device is released",
687 					 current->comm, task_pid_nr(current));
688 			}
689 		}
690 	}
691 
692 	mutex_lock(&group->device_lock);
693 	list_del(&device->group_next);
694 	group->dev_counter--;
695 	mutex_unlock(&group->device_lock);
696 
697 	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
698 		iommu_group_remove_device(device->dev);
699 
700 	/* Matches the get in vfio_register_group_dev() */
701 	vfio_group_put(group);
702 }
703 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
704 
705 /*
706  * VFIO base fd, /dev/vfio/vfio
707  */
708 static long vfio_ioctl_check_extension(struct vfio_container *container,
709 				       unsigned long arg)
710 {
711 	struct vfio_iommu_driver *driver;
712 	long ret = 0;
713 
714 	down_read(&container->group_lock);
715 
716 	driver = container->iommu_driver;
717 
718 	switch (arg) {
719 		/* No base extensions yet */
720 	default:
721 		/*
722 		 * If no driver is set, poll all registered drivers for
723 		 * extensions and return the first positive result.  If
724 		 * a driver is already set, further queries will be passed
725 		 * only to that driver.
726 		 */
727 		if (!driver) {
728 			mutex_lock(&vfio.iommu_drivers_lock);
729 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
730 					    vfio_next) {
731 
732 				if (!list_empty(&container->group_list) &&
733 				    !vfio_iommu_driver_allowed(container,
734 							       driver))
735 					continue;
736 				if (!try_module_get(driver->ops->owner))
737 					continue;
738 
739 				ret = driver->ops->ioctl(NULL,
740 							 VFIO_CHECK_EXTENSION,
741 							 arg);
742 				module_put(driver->ops->owner);
743 				if (ret > 0)
744 					break;
745 			}
746 			mutex_unlock(&vfio.iommu_drivers_lock);
747 		} else
748 			ret = driver->ops->ioctl(container->iommu_data,
749 						 VFIO_CHECK_EXTENSION, arg);
750 	}
751 
752 	up_read(&container->group_lock);
753 
754 	return ret;
755 }
756 
757 /* hold write lock on container->group_lock */
758 static int __vfio_container_attach_groups(struct vfio_container *container,
759 					  struct vfio_iommu_driver *driver,
760 					  void *data)
761 {
762 	struct vfio_group *group;
763 	int ret = -ENODEV;
764 
765 	list_for_each_entry(group, &container->group_list, container_next) {
766 		ret = driver->ops->attach_group(data, group->iommu_group,
767 						group->type);
768 		if (ret)
769 			goto unwind;
770 	}
771 
772 	return ret;
773 
774 unwind:
775 	list_for_each_entry_continue_reverse(group, &container->group_list,
776 					     container_next) {
777 		driver->ops->detach_group(data, group->iommu_group);
778 	}
779 
780 	return ret;
781 }
782 
783 static long vfio_ioctl_set_iommu(struct vfio_container *container,
784 				 unsigned long arg)
785 {
786 	struct vfio_iommu_driver *driver;
787 	long ret = -ENODEV;
788 
789 	down_write(&container->group_lock);
790 
791 	/*
792 	 * The container is designed to be an unprivileged interface while
793 	 * the group can be assigned to specific users.  Therefore, only by
794 	 * adding a group to a container does the user get the privilege of
795 	 * enabling the iommu, which may allocate finite resources.  There
796 	 * is no unset_iommu, but by removing all the groups from a container,
797 	 * the container is deprivileged and returns to an unset state.
798 	 */
799 	if (list_empty(&container->group_list) || container->iommu_driver) {
800 		up_write(&container->group_lock);
801 		return -EINVAL;
802 	}
803 
804 	mutex_lock(&vfio.iommu_drivers_lock);
805 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
806 		void *data;
807 
808 		if (!vfio_iommu_driver_allowed(container, driver))
809 			continue;
810 		if (!try_module_get(driver->ops->owner))
811 			continue;
812 
813 		/*
814 		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
815 		 * so test which iommu driver reported support for this
816 		 * extension and call open on them.  We also pass them the
817 		 * magic, allowing a single driver to support multiple
818 		 * interfaces if they'd like.
819 		 */
820 		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
821 			module_put(driver->ops->owner);
822 			continue;
823 		}
824 
825 		data = driver->ops->open(arg);
826 		if (IS_ERR(data)) {
827 			ret = PTR_ERR(data);
828 			module_put(driver->ops->owner);
829 			continue;
830 		}
831 
832 		ret = __vfio_container_attach_groups(container, driver, data);
833 		if (ret) {
834 			driver->ops->release(data);
835 			module_put(driver->ops->owner);
836 			continue;
837 		}
838 
839 		container->iommu_driver = driver;
840 		container->iommu_data = data;
841 		break;
842 	}
843 
844 	mutex_unlock(&vfio.iommu_drivers_lock);
845 	up_write(&container->group_lock);
846 
847 	return ret;
848 }
849 
850 static long vfio_fops_unl_ioctl(struct file *filep,
851 				unsigned int cmd, unsigned long arg)
852 {
853 	struct vfio_container *container = filep->private_data;
854 	struct vfio_iommu_driver *driver;
855 	void *data;
856 	long ret = -EINVAL;
857 
858 	if (!container)
859 		return ret;
860 
861 	switch (cmd) {
862 	case VFIO_GET_API_VERSION:
863 		ret = VFIO_API_VERSION;
864 		break;
865 	case VFIO_CHECK_EXTENSION:
866 		ret = vfio_ioctl_check_extension(container, arg);
867 		break;
868 	case VFIO_SET_IOMMU:
869 		ret = vfio_ioctl_set_iommu(container, arg);
870 		break;
871 	default:
872 		driver = container->iommu_driver;
873 		data = container->iommu_data;
874 
875 		if (driver) /* passthrough all unrecognized ioctls */
876 			ret = driver->ops->ioctl(data, cmd, arg);
877 	}
878 
879 	return ret;
880 }
881 
882 static int vfio_fops_open(struct inode *inode, struct file *filep)
883 {
884 	struct vfio_container *container;
885 
886 	container = kzalloc(sizeof(*container), GFP_KERNEL);
887 	if (!container)
888 		return -ENOMEM;
889 
890 	INIT_LIST_HEAD(&container->group_list);
891 	init_rwsem(&container->group_lock);
892 	kref_init(&container->kref);
893 
894 	filep->private_data = container;
895 
896 	return 0;
897 }
898 
899 static int vfio_fops_release(struct inode *inode, struct file *filep)
900 {
901 	struct vfio_container *container = filep->private_data;
902 	struct vfio_iommu_driver *driver = container->iommu_driver;
903 
904 	if (driver && driver->ops->notify)
905 		driver->ops->notify(container->iommu_data,
906 				    VFIO_IOMMU_CONTAINER_CLOSE);
907 
908 	filep->private_data = NULL;
909 
910 	vfio_container_put(container);
911 
912 	return 0;
913 }
914 
915 static const struct file_operations vfio_fops = {
916 	.owner		= THIS_MODULE,
917 	.open		= vfio_fops_open,
918 	.release	= vfio_fops_release,
919 	.unlocked_ioctl	= vfio_fops_unl_ioctl,
920 	.compat_ioctl	= compat_ptr_ioctl,
921 };
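
/*
 * For reference, the userspace side of the container setup implemented by
 * these fops roughly follows the documented example flow (values are
 * illustrative only):
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container, VFIO_GET_API_VERSION);
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */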
922 
923 /*
924  * VFIO Group fd, /dev/vfio/$GROUP
925  */
926 static void __vfio_group_unset_container(struct vfio_group *group)
927 {
928 	struct vfio_container *container = group->container;
929 	struct vfio_iommu_driver *driver;
930 
931 	lockdep_assert_held_write(&group->group_rwsem);
932 
933 	down_write(&container->group_lock);
934 
935 	driver = container->iommu_driver;
936 	if (driver)
937 		driver->ops->detach_group(container->iommu_data,
938 					  group->iommu_group);
939 
940 	if (group->type == VFIO_IOMMU)
941 		iommu_group_release_dma_owner(group->iommu_group);
942 
943 	group->container = NULL;
944 	group->container_users = 0;
945 	list_del(&group->container_next);
946 
947 	/* Detaching the last group deprivileges a container, remove iommu */
948 	if (driver && list_empty(&container->group_list)) {
949 		driver->ops->release(container->iommu_data);
950 		module_put(driver->ops->owner);
951 		container->iommu_driver = NULL;
952 		container->iommu_data = NULL;
953 	}
954 
955 	up_write(&container->group_lock);
956 
957 	vfio_container_put(container);
958 }
959 
960 /*
961  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
962  * if there was no container to unset.  Since the ioctl is called on
963  * the group, we know the group still exists, therefore the only valid
964  * transition here is 1->0.
965  */
966 static int vfio_group_unset_container(struct vfio_group *group)
967 {
968 	lockdep_assert_held_write(&group->group_rwsem);
969 
970 	if (!group->container)
971 		return -EINVAL;
972 	if (group->container_users != 1)
973 		return -EBUSY;
974 	__vfio_group_unset_container(group);
975 	return 0;
976 }
977 
978 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
979 {
980 	struct fd f;
981 	struct vfio_container *container;
982 	struct vfio_iommu_driver *driver;
983 	int ret = 0;
984 
985 	lockdep_assert_held_write(&group->group_rwsem);
986 
987 	if (group->container || WARN_ON(group->container_users))
988 		return -EINVAL;
989 
990 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
991 		return -EPERM;
992 
993 	f = fdget(container_fd);
994 	if (!f.file)
995 		return -EBADF;
996 
997 	/* Sanity check, is this really our fd? */
998 	if (f.file->f_op != &vfio_fops) {
999 		fdput(f);
1000 		return -EINVAL;
1001 	}
1002 
1003 	container = f.file->private_data;
1004 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1005 
1006 	down_write(&container->group_lock);
1007 
1008 	/* Real groups and fake groups cannot mix */
1009 	if (!list_empty(&container->group_list) &&
1010 	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
1011 		ret = -EPERM;
1012 		goto unlock_out;
1013 	}
1014 
1015 	if (group->type == VFIO_IOMMU) {
1016 		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
1017 		if (ret)
1018 			goto unlock_out;
1019 	}
1020 
1021 	driver = container->iommu_driver;
1022 	if (driver) {
1023 		ret = driver->ops->attach_group(container->iommu_data,
1024 						group->iommu_group,
1025 						group->type);
1026 		if (ret) {
1027 			if (group->type == VFIO_IOMMU)
1028 				iommu_group_release_dma_owner(
1029 					group->iommu_group);
1030 			goto unlock_out;
1031 		}
1032 	}
1033 
1034 	group->container = container;
1035 	group->container_users = 1;
1036 	container->noiommu = (group->type == VFIO_NO_IOMMU);
1037 	list_add(&group->container_next, &container->group_list);
1038 
1039 	/* Get a reference on the container and mark a user within the group */
1040 	vfio_container_get(container);
1041 
1042 unlock_out:
1043 	up_write(&container->group_lock);
1044 	fdput(f);
1045 	return ret;
1046 }
1047 
1048 static const struct file_operations vfio_device_fops;
1049 
1050 /* true if the vfio_device has open_device() called but not close_device() */
1051 static bool vfio_assert_device_open(struct vfio_device *device)
1052 {
1053 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
1054 }
1055 
1056 static int vfio_device_assign_container(struct vfio_device *device)
1057 {
1058 	struct vfio_group *group = device->group;
1059 
1060 	lockdep_assert_held_write(&group->group_rwsem);
1061 
1062 	if (!group->container || !group->container->iommu_driver ||
1063 	    WARN_ON(!group->container_users))
1064 		return -EINVAL;
1065 
1066 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1067 		return -EPERM;
1068 
1069 	get_file(group->opened_file);
1070 	group->container_users++;
1071 	return 0;
1072 }
1073 
1074 static void vfio_device_unassign_container(struct vfio_device *device)
1075 {
1076 	down_write(&device->group->group_rwsem);
1077 	WARN_ON(device->group->container_users <= 1);
1078 	device->group->container_users--;
1079 	fput(device->group->opened_file);
1080 	up_write(&device->group->group_rwsem);
1081 }
1082 
1083 static struct file *vfio_device_open(struct vfio_device *device)
1084 {
1085 	struct file *filep;
1086 	int ret;
1087 
1088 	down_write(&device->group->group_rwsem);
1089 	ret = vfio_device_assign_container(device);
1090 	up_write(&device->group->group_rwsem);
1091 	if (ret)
1092 		return ERR_PTR(ret);
1093 
1094 	if (!try_module_get(device->dev->driver->owner)) {
1095 		ret = -ENODEV;
1096 		goto err_unassign_container;
1097 	}
1098 
1099 	mutex_lock(&device->dev_set->lock);
1100 	device->open_count++;
1101 	if (device->open_count == 1) {
1102 		/*
1103 		 * Here we pass the KVM pointer with the group under the read
1104 		 * lock.  If the device driver will use it, it must obtain a
1105 		 * reference and release it during close_device.
1106 		 */
1107 		down_read(&device->group->group_rwsem);
1108 		device->kvm = device->group->kvm;
1109 
1110 		if (device->ops->open_device) {
1111 			ret = device->ops->open_device(device);
1112 			if (ret)
1113 				goto err_undo_count;
1114 		}
1115 		up_read(&device->group->group_rwsem);
1116 	}
1117 	mutex_unlock(&device->dev_set->lock);
1118 
1119 	/*
1120 	 * We can't use anon_inode_getfd() because we need to modify
1121 	 * the f_mode flags directly to allow more than just ioctls
1122 	 */
1123 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1124 				   device, O_RDWR);
1125 	if (IS_ERR(filep)) {
1126 		ret = PTR_ERR(filep);
1127 		goto err_close_device;
1128 	}
1129 
1130 	/*
1131 	 * TODO: add an anon_inode interface to do this.
1132 	 * Appears to be missing by lack of need rather than
1133 	 * explicitly prevented.  Now there's need.
1134 	 */
1135 	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1136 
1137 	if (device->group->type == VFIO_NO_IOMMU)
1138 		dev_warn(device->dev, "vfio-noiommu device opened by user "
1139 			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1140 	/*
1141 	 * On success the ref of device is moved to the file and
1142 	 * put in vfio_device_fops_release()
1143 	 */
1144 	return filep;
1145 
1146 err_close_device:
1147 	mutex_lock(&device->dev_set->lock);
1148 	down_read(&device->group->group_rwsem);
1149 	if (device->open_count == 1 && device->ops->close_device)
1150 		device->ops->close_device(device);
1151 err_undo_count:
1152 	device->open_count--;
1153 	if (device->open_count == 0 && device->kvm)
1154 		device->kvm = NULL;
1155 	up_read(&device->group->group_rwsem);
1156 	mutex_unlock(&device->dev_set->lock);
1157 	module_put(device->dev->driver->owner);
1158 err_unassign_container:
1159 	vfio_device_unassign_container(device);
1160 	return ERR_PTR(ret);
1161 }
1162 
1163 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1164 {
1165 	struct vfio_device *device;
1166 	struct file *filep;
1167 	int fdno;
1168 	int ret;
1169 
1170 	device = vfio_device_get_from_name(group, buf);
1171 	if (IS_ERR(device))
1172 		return PTR_ERR(device);
1173 
1174 	fdno = get_unused_fd_flags(O_CLOEXEC);
1175 	if (fdno < 0) {
1176 		ret = fdno;
1177 		goto err_put_device;
1178 	}
1179 
1180 	filep = vfio_device_open(device);
1181 	if (IS_ERR(filep)) {
1182 		ret = PTR_ERR(filep);
1183 		goto err_put_fdno;
1184 	}
1185 
1186 	fd_install(fdno, filep);
1187 	return fdno;
1188 
1189 err_put_fdno:
1190 	put_unused_fd(fdno);
1191 err_put_device:
1192 	vfio_device_put(device);
1193 	return ret;
1194 }
1195 
1196 static long vfio_group_fops_unl_ioctl(struct file *filep,
1197 				      unsigned int cmd, unsigned long arg)
1198 {
1199 	struct vfio_group *group = filep->private_data;
1200 	long ret = -ENOTTY;
1201 
1202 	switch (cmd) {
1203 	case VFIO_GROUP_GET_STATUS:
1204 	{
1205 		struct vfio_group_status status;
1206 		unsigned long minsz;
1207 
1208 		minsz = offsetofend(struct vfio_group_status, flags);
1209 
1210 		if (copy_from_user(&status, (void __user *)arg, minsz))
1211 			return -EFAULT;
1212 
1213 		if (status.argsz < minsz)
1214 			return -EINVAL;
1215 
1216 		status.flags = 0;
1217 
1218 		down_read(&group->group_rwsem);
1219 		if (group->container)
1220 			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
1221 					VFIO_GROUP_FLAGS_VIABLE;
1222 		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
1223 			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1224 		up_read(&group->group_rwsem);
1225 
1226 		if (copy_to_user((void __user *)arg, &status, minsz))
1227 			return -EFAULT;
1228 
1229 		ret = 0;
1230 		break;
1231 	}
1232 	case VFIO_GROUP_SET_CONTAINER:
1233 	{
1234 		int fd;
1235 
1236 		if (get_user(fd, (int __user *)arg))
1237 			return -EFAULT;
1238 
1239 		if (fd < 0)
1240 			return -EINVAL;
1241 
1242 		down_write(&group->group_rwsem);
1243 		ret = vfio_group_set_container(group, fd);
1244 		up_write(&group->group_rwsem);
1245 		break;
1246 	}
1247 	case VFIO_GROUP_UNSET_CONTAINER:
1248 		down_write(&group->group_rwsem);
1249 		ret = vfio_group_unset_container(group);
1250 		up_write(&group->group_rwsem);
1251 		break;
1252 	case VFIO_GROUP_GET_DEVICE_FD:
1253 	{
1254 		char *buf;
1255 
1256 		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1257 		if (IS_ERR(buf))
1258 			return PTR_ERR(buf);
1259 
1260 		ret = vfio_group_get_device_fd(group, buf);
1261 		kfree(buf);
1262 		break;
1263 	}
1264 	}
1265 
1266 	return ret;
1267 }
1268 
1269 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1270 {
1271 	struct vfio_group *group =
1272 		container_of(inode->i_cdev, struct vfio_group, cdev);
1273 	int ret;
1274 
1275 	down_write(&group->group_rwsem);
1276 
1277 	/* users can be zero if this races with vfio_group_put() */
1278 	if (!refcount_inc_not_zero(&group->users)) {
1279 		ret = -ENODEV;
1280 		goto err_unlock;
1281 	}
1282 
1283 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1284 		ret = -EPERM;
1285 		goto err_put;
1286 	}
1287 
1288 	/*
1289 	 * Do we need multiple instances of the group open?  Seems not.
1290 	 */
1291 	if (group->opened_file) {
1292 		ret = -EBUSY;
1293 		goto err_put;
1294 	}
1295 	group->opened_file = filep;
1296 	filep->private_data = group;
1297 
1298 	up_write(&group->group_rwsem);
1299 	return 0;
1300 err_put:
1301 	vfio_group_put(group);
1302 err_unlock:
1303 	up_write(&group->group_rwsem);
1304 	return ret;
1305 }
1306 
1307 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1308 {
1309 	struct vfio_group *group = filep->private_data;
1310 
1311 	filep->private_data = NULL;
1312 
1313 	down_write(&group->group_rwsem);
1314 	/*
1315 	 * Device FDs hold a group file reference, therefore the group release
1316 	 * is only called when there are no open devices.
1317 	 */
1318 	WARN_ON(group->notifier.head);
1319 	if (group->container) {
1320 		WARN_ON(group->container_users != 1);
1321 		__vfio_group_unset_container(group);
1322 	}
1323 	group->opened_file = NULL;
1324 	up_write(&group->group_rwsem);
1325 
1326 	vfio_group_put(group);
1327 
1328 	return 0;
1329 }
1330 
1331 static const struct file_operations vfio_group_fops = {
1332 	.owner		= THIS_MODULE,
1333 	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1334 	.compat_ioctl	= compat_ptr_ioctl,
1335 	.open		= vfio_group_fops_open,
1336 	.release	= vfio_group_fops_release,
1337 };
1338 
1339 /*
1340  * VFIO Device fd
1341  */
1342 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1343 {
1344 	struct vfio_device *device = filep->private_data;
1345 
1346 	mutex_lock(&device->dev_set->lock);
1347 	vfio_assert_device_open(device);
1348 	down_read(&device->group->group_rwsem);
1349 	if (device->open_count == 1 && device->ops->close_device)
1350 		device->ops->close_device(device);
1351 	up_read(&device->group->group_rwsem);
1352 	device->open_count--;
1353 	if (device->open_count == 0)
1354 		device->kvm = NULL;
1355 	mutex_unlock(&device->dev_set->lock);
1356 
1357 	module_put(device->dev->driver->owner);
1358 
1359 	vfio_device_unassign_container(device);
1360 
1361 	vfio_device_put(device);
1362 
1363 	return 0;
1364 }
1365 
1366 /*
1367  * vfio_mig_get_next_state - Compute the next step in the FSM
1368  * @cur_fsm - The current state the device is in
1369  * @new_fsm - The target state to reach
1370  * @next_fsm - Pointer to the next step to get to new_fsm
1371  *
1372  * Return 0 upon success, otherwise -errno
1373  * Upon success the next step in the state progression between cur_fsm and
1374  * new_fsm will be set in next_fsm.
1375  *
1376  * This breaks down requests for combination transitions into smaller steps and
1377  * returns the next step to get to new_fsm. The function may need to be called
1378  * multiple times before reaching new_fsm.
1379  *
1380  */
1381 int vfio_mig_get_next_state(struct vfio_device *device,
1382 			    enum vfio_device_mig_state cur_fsm,
1383 			    enum vfio_device_mig_state new_fsm,
1384 			    enum vfio_device_mig_state *next_fsm)
1385 {
1386 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
1387 	/*
1388 	 * The coding in this table requires the driver to implement the
1389 	 * following FSM arcs:
1390 	 *         RESUMING -> STOP
1391 	 *         STOP -> RESUMING
1392 	 *         STOP -> STOP_COPY
1393 	 *         STOP_COPY -> STOP
1394 	 *
1395 	 * If P2P is supported then the driver must also implement these FSM
1396 	 * arcs:
1397 	 *         RUNNING -> RUNNING_P2P
1398 	 *         RUNNING_P2P -> RUNNING
1399 	 *         RUNNING_P2P -> STOP
1400 	 *         STOP -> RUNNING_P2P
1401 	 * Without P2P the driver must implement:
1402 	 *         RUNNING -> STOP
1403 	 *         STOP -> RUNNING
1404 	 *
1405 	 * The coding will step through multiple states for some combination
1406 	 * transitions; if all optional features are supported, this means the
1407 	 * following ones:
1408 	 *         RESUMING -> STOP -> RUNNING_P2P
1409 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
1410 	 *         RESUMING -> STOP -> STOP_COPY
1411 	 *         RUNNING -> RUNNING_P2P -> STOP
1412 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
1413 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
1414 	 *         RUNNING_P2P -> STOP -> RESUMING
1415 	 *         RUNNING_P2P -> STOP -> STOP_COPY
1416 	 *         STOP -> RUNNING_P2P -> RUNNING
1417 	 *         STOP_COPY -> STOP -> RESUMING
1418 	 *         STOP_COPY -> STOP -> RUNNING_P2P
1419 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
1420 	 */
1421 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
1422 		[VFIO_DEVICE_STATE_STOP] = {
1423 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1424 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1425 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1426 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1427 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1428 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1429 		},
1430 		[VFIO_DEVICE_STATE_RUNNING] = {
1431 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
1432 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1433 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
1434 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1435 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1436 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1437 		},
1438 		[VFIO_DEVICE_STATE_STOP_COPY] = {
1439 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1440 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1441 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1442 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1443 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1444 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1445 		},
1446 		[VFIO_DEVICE_STATE_RESUMING] = {
1447 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1448 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1449 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1450 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1451 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1452 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1453 		},
1454 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
1455 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1456 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1457 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1458 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1459 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1460 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1461 		},
1462 		[VFIO_DEVICE_STATE_ERROR] = {
1463 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
1464 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
1465 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
1466 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
1467 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
1468 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1469 		},
1470 	};
1471 
1472 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
1473 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
1474 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
1475 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
1476 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
1477 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
1478 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
1479 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
1480 	};
1481 
1482 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1483 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
1484 			state_flags_table[cur_fsm]))
1485 		return -EINVAL;
1486 
1487 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1488 	   (state_flags_table[new_fsm] & device->migration_flags) !=
1489 			state_flags_table[new_fsm])
1490 		return -EINVAL;
1491 
1492 	/*
1493 	 * Arcs touching optional and unsupported states are skipped over. The
1494 	 * driver will instead see an arc from the original state to the next
1495 	 * logical state, as per the above comment.
1496 	 */
1497 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
1498 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
1499 			state_flags_table[*next_fsm])
1500 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
1501 
1502 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
1503 }
1504 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
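
/*
 * Illustrative caller loop (driver side, my_step_device() is a placeholder):
 * a migration_set_state() implementation typically walks the FSM one arc at
 * a time until the requested state is reached:
 *
 *	while (cur != new_state) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			break;
 *		ret = my_step_device(vdev, next);
 *		if (ret)
 *			break;
 *		cur = next;
 *	}
 */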
1505 
1506 /*
1507  * Convert the driver's struct file into an FD number and return it to userspace
1508  */
1509 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
1510 				   struct vfio_device_feature_mig_state *mig)
1511 {
1512 	int ret;
1513 	int fd;
1514 
1515 	fd = get_unused_fd_flags(O_CLOEXEC);
1516 	if (fd < 0) {
1517 		ret = fd;
1518 		goto out_fput;
1519 	}
1520 
1521 	mig->data_fd = fd;
1522 	if (copy_to_user(arg, mig, sizeof(*mig))) {
1523 		ret = -EFAULT;
1524 		goto out_put_unused;
1525 	}
1526 	fd_install(fd, filp);
1527 	return 0;
1528 
1529 out_put_unused:
1530 	put_unused_fd(fd);
1531 out_fput:
1532 	fput(filp);
1533 	return ret;
1534 }
1535 
1536 static int
1537 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
1538 					   u32 flags, void __user *arg,
1539 					   size_t argsz)
1540 {
1541 	size_t minsz =
1542 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
1543 	struct vfio_device_feature_mig_state mig;
1544 	struct file *filp = NULL;
1545 	int ret;
1546 
1547 	if (!device->mig_ops)
1548 		return -ENOTTY;
1549 
1550 	ret = vfio_check_feature(flags, argsz,
1551 				 VFIO_DEVICE_FEATURE_SET |
1552 				 VFIO_DEVICE_FEATURE_GET,
1553 				 sizeof(mig));
1554 	if (ret != 1)
1555 		return ret;
1556 
1557 	if (copy_from_user(&mig, arg, minsz))
1558 		return -EFAULT;
1559 
1560 	if (flags & VFIO_DEVICE_FEATURE_GET) {
1561 		enum vfio_device_mig_state curr_state;
1562 
1563 		ret = device->mig_ops->migration_get_state(device,
1564 							   &curr_state);
1565 		if (ret)
1566 			return ret;
1567 		mig.device_state = curr_state;
1568 		goto out_copy;
1569 	}
1570 
1571 	/* Handle the VFIO_DEVICE_FEATURE_SET */
1572 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
1573 	if (IS_ERR(filp) || !filp)
1574 		goto out_copy;
1575 
1576 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
1577 out_copy:
1578 	mig.data_fd = -1;
1579 	if (copy_to_user(arg, &mig, sizeof(mig)))
1580 		return -EFAULT;
1581 	if (IS_ERR(filp))
1582 		return PTR_ERR(filp);
1583 	return 0;
1584 }
1585 
1586 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
1587 					       u32 flags, void __user *arg,
1588 					       size_t argsz)
1589 {
1590 	struct vfio_device_feature_migration mig = {
1591 		.flags = device->migration_flags,
1592 	};
1593 	int ret;
1594 
1595 	if (!device->mig_ops)
1596 		return -ENOTTY;
1597 
1598 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1599 				 sizeof(mig));
1600 	if (ret != 1)
1601 		return ret;
1602 	if (copy_to_user(arg, &mig, sizeof(mig)))
1603 		return -EFAULT;
1604 	return 0;
1605 }
1606 
1607 static int vfio_ioctl_device_feature(struct vfio_device *device,
1608 				     struct vfio_device_feature __user *arg)
1609 {
1610 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1611 	struct vfio_device_feature feature;
1612 
1613 	if (copy_from_user(&feature, arg, minsz))
1614 		return -EFAULT;
1615 
1616 	if (feature.argsz < minsz)
1617 		return -EINVAL;
1618 
1619 	/* Check unknown flags */
1620 	if (feature.flags &
1621 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1622 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1623 		return -EINVAL;
1624 
1625 	/* GET & SET are mutually exclusive except with PROBE */
1626 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1627 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1628 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1629 		return -EINVAL;
1630 
1631 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1632 	case VFIO_DEVICE_FEATURE_MIGRATION:
1633 		return vfio_ioctl_device_feature_migration(
1634 			device, feature.flags, arg->data,
1635 			feature.argsz - minsz);
1636 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1637 		return vfio_ioctl_device_feature_mig_device_state(
1638 			device, feature.flags, arg->data,
1639 			feature.argsz - minsz);
1640 	default:
1641 		if (unlikely(!device->ops->device_feature))
1642 			return -EINVAL;
1643 		return device->ops->device_feature(device, feature.flags,
1644 						   arg->data,
1645 						   feature.argsz - minsz);
1646 	}
1647 }
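
/*
 * From userspace this ioctl takes a struct vfio_device_feature header
 * followed by feature-specific payload.  As an illustration (not a complete
 * program), probing for migration support without side effects looks like:
 *
 *	struct vfio_device_feature feat = {
 *		.argsz = sizeof(feat),
 *		.flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_GET |
 *			 VFIO_DEVICE_FEATURE_MIGRATION,
 *	};
 *	ret = ioctl(device_fd, VFIO_DEVICE_FEATURE, &feat);
 *
 * where a zero return means the GET of that feature is supported.
 */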
1648 
1649 static long vfio_device_fops_unl_ioctl(struct file *filep,
1650 				       unsigned int cmd, unsigned long arg)
1651 {
1652 	struct vfio_device *device = filep->private_data;
1653 
1654 	switch (cmd) {
1655 	case VFIO_DEVICE_FEATURE:
1656 		return vfio_ioctl_device_feature(device, (void __user *)arg);
1657 	default:
1658 		if (unlikely(!device->ops->ioctl))
1659 			return -EINVAL;
1660 		return device->ops->ioctl(device, cmd, arg);
1661 	}
1662 }
1663 
1664 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1665 				     size_t count, loff_t *ppos)
1666 {
1667 	struct vfio_device *device = filep->private_data;
1668 
1669 	if (unlikely(!device->ops->read))
1670 		return -EINVAL;
1671 
1672 	return device->ops->read(device, buf, count, ppos);
1673 }
1674 
1675 static ssize_t vfio_device_fops_write(struct file *filep,
1676 				      const char __user *buf,
1677 				      size_t count, loff_t *ppos)
1678 {
1679 	struct vfio_device *device = filep->private_data;
1680 
1681 	if (unlikely(!device->ops->write))
1682 		return -EINVAL;
1683 
1684 	return device->ops->write(device, buf, count, ppos);
1685 }
1686 
1687 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1688 {
1689 	struct vfio_device *device = filep->private_data;
1690 
1691 	if (unlikely(!device->ops->mmap))
1692 		return -EINVAL;
1693 
1694 	return device->ops->mmap(device, vma);
1695 }
1696 
1697 static const struct file_operations vfio_device_fops = {
1698 	.owner		= THIS_MODULE,
1699 	.release	= vfio_device_fops_release,
1700 	.read		= vfio_device_fops_read,
1701 	.write		= vfio_device_fops_write,
1702 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1703 	.compat_ioctl	= compat_ptr_ioctl,
1704 	.mmap		= vfio_device_fops_mmap,
1705 };
1706 
1707 /**
1708  * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
1709  * @file: VFIO group file
1710  *
1711  * The returned iommu_group is valid as long as a ref is held on the file.
1712  */
1713 struct iommu_group *vfio_file_iommu_group(struct file *file)
1714 {
1715 	struct vfio_group *group = file->private_data;
1716 
1717 	if (file->f_op != &vfio_group_fops)
1718 		return NULL;
1719 	return group->iommu_group;
1720 }
1721 EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
1722 
1723 /**
1724  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1725  *        is always CPU cache coherent
1726  * @file: VFIO group file
1727  *
1728  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1729  * bit in DMA transactions. A return of false indicates that the user has
1730  * rights to access additional instructions such as wbinvd on x86.
1731  */
1732 bool vfio_file_enforced_coherent(struct file *file)
1733 {
1734 	struct vfio_group *group = file->private_data;
1735 	bool ret;
1736 
1737 	if (file->f_op != &vfio_group_fops)
1738 		return true;
1739 
1740 	down_read(&group->group_rwsem);
1741 	if (group->container) {
1742 		ret = vfio_ioctl_check_extension(group->container,
1743 						 VFIO_DMA_CC_IOMMU);
1744 	} else {
1745 		/*
1746 		 * Since the coherency state is determined only once a container
1747 		 * is attached, the user must attach a container before they can
1748 		 * prove they have permission.
1749 		 */
1750 		ret = true;
1751 	}
1752 	up_read(&group->group_rwsem);
1753 	return ret;
1754 }
1755 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
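
/*
 * Illustrative usage, a hedged sketch: a hypervisor-side caller (e.g. the
 * kvm-vfio device) can consult this to decide whether the guest should be
 * allowed non-coherent DMA behaviour such as wbinvd emulation on x86.  The
 * helper called below is hypothetical.
 *
 *	if (!vfio_file_enforced_coherent(filp))
 *		kvm_allow_noncoherent_dma(kvm);
 */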
1756 
1757 /**
1758  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1759  * @file: VFIO group file
1760  * @kvm: KVM to link
1761  *
1762  * When a VFIO device is first opened the KVM will be available in
1763  * device->kvm if one was associated with the group.
1764  */
1765 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1766 {
1767 	struct vfio_group *group = file->private_data;
1768 
1769 	if (file->f_op != &vfio_group_fops)
1770 		return;
1771 
1772 	down_write(&group->group_rwsem);
1773 	group->kvm = kvm;
1774 	up_write(&group->group_rwsem);
1775 }
1776 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
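
/*
 * Illustrative usage, a hedged sketch of how a KVM<->VFIO binding might
 * drive this: the kvm pointer is linked while the association exists and
 * cleared before the group file reference is dropped, so that devices
 * opened through the group observe device->kvm.
 *
 *	vfio_file_set_kvm(filp, kvm);		on attach
 *	...
 *	vfio_file_set_kvm(filp, NULL);		on detach
 */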
1777 
1778 /**
1779  * vfio_file_has_dev - True if the VFIO file is a handle for device
1780  * @file: VFIO file to check
1781  * @device: Device that must be part of the file
1782  *
1783  * Returns true if given file has permission to manipulate the given device.
1784  */
1785 bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
1786 {
1787 	struct vfio_group *group = file->private_data;
1788 
1789 	if (file->f_op != &vfio_group_fops)
1790 		return false;
1791 
1792 	return group == device->group;
1793 }
1794 EXPORT_SYMBOL_GPL(vfio_file_has_dev);
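
/*
 * Illustrative usage, a hedged sketch: a caller handed both a VFIO file and
 * a vfio_device can check that the file actually grants authority over that
 * device before acting on its behalf.
 *
 *	if (!vfio_file_has_dev(filp, device))
 *		return -EPERM;
 */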
1795 
1796 /*
1797  * Sub-module support
1798  */
1799 /*
1800  * Helper for managing a buffer of info chain capabilities: allocate or
1801  * reallocate the buffer with an additional @size bytes, filling in @id and
1802  * @version of the new capability.  A pointer to the new capability is returned.
1803  *
1804  * NB. The chain is based at the head of the buffer, so new entries are
1805  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1806  * next offsets prior to copying to the user buffer.
1807  */
1808 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1809 					       size_t size, u16 id, u16 version)
1810 {
1811 	void *buf;
1812 	struct vfio_info_cap_header *header, *tmp;
1813 
1814 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1815 	if (!buf) {
1816 		kfree(caps->buf);
1817 		caps->buf = NULL;
1818 		caps->size = 0;
1819 		return ERR_PTR(-ENOMEM);
1820 	}
1821 
1822 	caps->buf = buf;
1823 	header = buf + caps->size;
1824 
1825 	/* Eventually copied to user buffer, zero */
1826 	memset(header, 0, size);
1827 
1828 	header->id = id;
1829 	header->version = version;
1830 
1831 	/* Add to the end of the capability chain */
1832 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1833 		; /* nothing */
1834 
1835 	tmp->next = caps->size;
1836 	caps->size += size;
1837 
1838 	return header;
1839 }
1840 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1841 
1842 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1843 {
1844 	struct vfio_info_cap_header *tmp;
1845 	void *buf = (void *)caps->buf;
1846 
1847 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1848 		tmp->next += offset;
1849 }
1850 EXPORT_SYMBOL(vfio_info_cap_shift);
1851 
1852 int vfio_info_add_capability(struct vfio_info_cap *caps,
1853 			     struct vfio_info_cap_header *cap, size_t size)
1854 {
1855 	struct vfio_info_cap_header *header;
1856 
1857 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1858 	if (IS_ERR(header))
1859 		return PTR_ERR(header);
1860 
1861 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1862 
1863 	return 0;
1864 }
1865 EXPORT_SYMBOL(vfio_info_add_capability);
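
/*
 * Illustrative usage, a hedged sketch of the typical *_GET_INFO ioctl
 * pattern (variable names are assumptions): capabilities are appended with
 * vfio_info_cap_add()/vfio_info_add_capability(), the chain offsets are
 * rebased with vfio_info_cap_shift() to the capability area's offset within
 * the user structure, and the buffer is then copied out.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &cap.header, sizeof(cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */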
1866 
1867 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1868 				       int max_irq_type, size_t *data_size)
1869 {
1870 	unsigned long minsz;
1871 	size_t size;
1872 
1873 	minsz = offsetofend(struct vfio_irq_set, count);
1874 
1875 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1876 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1877 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1878 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1879 		return -EINVAL;
1880 
1881 	if (data_size)
1882 		*data_size = 0;
1883 
1884 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1885 		return -EINVAL;
1886 
1887 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1888 	case VFIO_IRQ_SET_DATA_NONE:
1889 		size = 0;
1890 		break;
1891 	case VFIO_IRQ_SET_DATA_BOOL:
1892 		size = sizeof(uint8_t);
1893 		break;
1894 	case VFIO_IRQ_SET_DATA_EVENTFD:
1895 		size = sizeof(int32_t);
1896 		break;
1897 	default:
1898 		return -EINVAL;
1899 	}
1900 
1901 	if (size) {
1902 		if (hdr->argsz - minsz < hdr->count * size)
1903 			return -EINVAL;
1904 
1905 		if (!data_size)
1906 			return -EINVAL;
1907 
1908 		*data_size = hdr->count * size;
1909 	}
1910 
1911 	return 0;
1912 }
1913 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
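
/*
 * Illustrative usage, a hedged sketch of how a bus driver's SET_IRQS ioctl
 * path typically uses this helper; "max_irqs" is assumed to be the number of
 * IRQs the driver exposes for hdr.index, and VFIO_PCI_NUM_IRQS stands in as
 * the max_irq_type value used by vfio-pci.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	minsz = offsetofend(struct vfio_irq_set, count);
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max_irqs,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */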
1914 
1915 /*
1916  * Pin a set of guest PFNs and return their associated host PFNs for local
1917  * domain only.
1918  * @device [in]  : device
1919  * @user_pfn [in]: array of user/guest PFNs to be pinned.
1920  * @npage [in]   : count of elements in user_pfn array.  This count should not
1921  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1922  * @prot [in]    : protection flags
1923  * @phys_pfn[out]: array of host PFNs
1924  * Return error or number of pages pinned.
1925  */
1926 int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
1927 		   int npage, int prot, unsigned long *phys_pfn)
1928 {
1929 	struct vfio_container *container;
1930 	struct vfio_group *group = device->group;
1931 	struct vfio_iommu_driver *driver;
1932 	int ret;
1933 
1934 	if (!user_pfn || !phys_pfn || !npage ||
1935 	    !vfio_assert_device_open(device))
1936 		return -EINVAL;
1937 
1938 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1939 		return -E2BIG;
1940 
1941 	if (group->dev_counter > 1)
1942 		return -EINVAL;
1943 
1944 	/* group->container cannot change while a vfio device is open */
1945 	container = group->container;
1946 	driver = container->iommu_driver;
1947 	if (likely(driver && driver->ops->pin_pages))
1948 		ret = driver->ops->pin_pages(container->iommu_data,
1949 					     group->iommu_group, user_pfn,
1950 					     npage, prot, phys_pfn);
1951 	else
1952 		ret = -ENOTTY;
1953 
1954 	return ret;
1955 }
1956 EXPORT_SYMBOL(vfio_pin_pages);
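
/*
 * Illustrative usage, a hedged sketch for an mdev-style driver ("mdev_state"
 * and its vdev member are assumed names): pin a single guest page for device
 * access and translate it to a host PFN.
 *
 *	unsigned long user_pfn = gpa >> PAGE_SHIFT;
 *	unsigned long phys_pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&mdev_state->vdev, &user_pfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 */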
1957 
1958 /*
1959  * Unpin set of host PFNs for local domain only.
1960  * @device [in]  : device
1961  * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1962  *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1963  * @npage [in]   : count of elements in user_pfn array.  This count should not
1964  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1965  * Return error or number of pages unpinned.
1966  */
1967 int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
1968 		     int npage)
1969 {
1970 	struct vfio_container *container;
1971 	struct vfio_iommu_driver *driver;
1972 	int ret;
1973 
1974 	if (!user_pfn || !npage || !vfio_assert_device_open(device))
1975 		return -EINVAL;
1976 
1977 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1978 		return -E2BIG;
1979 
1980 	/* group->container cannot change while a vfio device is open */
1981 	container = device->group->container;
1982 	driver = container->iommu_driver;
1983 	if (likely(driver && driver->ops->unpin_pages))
1984 		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1985 					       npage);
1986 	else
1987 		ret = -ENOTTY;
1988 
1989 	return ret;
1990 }
1991 EXPORT_SYMBOL(vfio_unpin_pages);
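
/*
 * Illustrative usage, a hedged sketch continuing the pin example above: the
 * same user PFN array is passed back once the device no longer needs the
 * pages.
 *
 *	ret = vfio_unpin_pages(&mdev_state->vdev, &user_pfn, 1);
 *	WARN_ON(ret != 1);
 */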
1992 
1993 /*
1994  * This interface allows the CPUs to perform a form of virtual DMA on
1995  * behalf of the device.
1996  *
1997  * The CPUs copy data between a kernel buffer and a range of IOVAs that
1998  * point into user space memory.
1999  *
2000  * As the read/write of user space memory is conducted via the CPUs and is
2001  * not a real device DMA, it is not necessary to pin the user space memory.
2002  *
2003  * @device [in]		: VFIO device
2004  * @user_iova [in]	: base IOVA of a user space buffer
2005  * @data [in]		: pointer to kernel buffer
2006  * @len [in]		: kernel buffer length
2007  * @write [in]		: true to copy @data into the IOVA range, false to copy from it
2008  * Return error code on failure or 0 on success.
2009  */
2010 int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
2011 		size_t len, bool write)
2012 {
2013 	struct vfio_container *container;
2014 	struct vfio_iommu_driver *driver;
2015 	int ret = 0;
2016 
2017 	if (!data || len <= 0 || !vfio_assert_device_open(device))
2018 		return -EINVAL;
2019 
2020 	/* group->container cannot change while a vfio device is open */
2021 	container = device->group->container;
2022 	driver = container->iommu_driver;
2023 
2024 	if (likely(driver && driver->ops->dma_rw))
2025 		ret = driver->ops->dma_rw(container->iommu_data,
2026 					  user_iova, data, len, write);
2027 	else
2028 		ret = -ENOTTY;
2029 	return ret;
2030 }
2031 EXPORT_SYMBOL(vfio_dma_rw);
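
/*
 * Illustrative usage, a hedged sketch: read a 32-bit value from guest/user
 * memory at an IOVA and write it back, without pinning the page.  The
 * variable names are assumptions for the example.
 *
 *	u32 val;
 *
 *	ret = vfio_dma_rw(&mdev_state->vdev, iova, &val, sizeof(val), false);
 *	if (ret)
 *		return ret;
 *
 *	ret = vfio_dma_rw(&mdev_state->vdev, iova, &val, sizeof(val), true);
 */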
2032 
2033 static int vfio_register_iommu_notifier(struct vfio_group *group,
2034 					unsigned long *events,
2035 					struct notifier_block *nb)
2036 {
2037 	struct vfio_container *container;
2038 	struct vfio_iommu_driver *driver;
2039 	int ret;
2040 
2041 	lockdep_assert_held_read(&group->group_rwsem);
2042 
2043 	container = group->container;
2044 	driver = container->iommu_driver;
2045 	if (likely(driver && driver->ops->register_notifier))
2046 		ret = driver->ops->register_notifier(container->iommu_data,
2047 						     events, nb);
2048 	else
2049 		ret = -ENOTTY;
2050 
2051 	return ret;
2052 }
2053 
2054 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2055 					  struct notifier_block *nb)
2056 {
2057 	struct vfio_container *container;
2058 	struct vfio_iommu_driver *driver;
2059 	int ret;
2060 
2061 	lockdep_assert_held_read(&group->group_rwsem);
2062 
2063 	container = group->container;
2064 	driver = container->iommu_driver;
2065 	if (likely(driver && driver->ops->unregister_notifier))
2066 		ret = driver->ops->unregister_notifier(container->iommu_data,
2067 						       nb);
2068 	else
2069 		ret = -ENOTTY;
2070 
2071 	return ret;
2072 }
2073 
2074 int vfio_register_notifier(struct vfio_device *device,
2075 			   enum vfio_notify_type type, unsigned long *events,
2076 			   struct notifier_block *nb)
2077 {
2078 	struct vfio_group *group = device->group;
2079 	int ret;
2080 
2081 	if (!nb || !events || (*events == 0) ||
2082 	    !vfio_assert_device_open(device))
2083 		return -EINVAL;
2084 
2085 	switch (type) {
2086 	case VFIO_IOMMU_NOTIFY:
2087 		ret = vfio_register_iommu_notifier(group, events, nb);
2088 		break;
2089 	default:
2090 		ret = -EINVAL;
2091 	}
2092 	return ret;
2093 }
2094 EXPORT_SYMBOL(vfio_register_notifier);
2095 
2096 int vfio_unregister_notifier(struct vfio_device *device,
2097 			     enum vfio_notify_type type,
2098 			     struct notifier_block *nb)
2099 {
2100 	struct vfio_group *group = device->group;
2101 	int ret;
2102 
2103 	if (!nb || !vfio_assert_device_open(device))
2104 		return -EINVAL;
2105 
2106 	switch (type) {
2107 	case VFIO_IOMMU_NOTIFY:
2108 		ret = vfio_unregister_iommu_notifier(group, nb);
2109 		break;
2110 	default:
2111 		ret = -EINVAL;
2112 	}
2113 	return ret;
2114 }
2115 EXPORT_SYMBOL(vfio_unregister_notifier);
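
/*
 * Illustrative usage, a hedged sketch for a driver that pins pages and must
 * react to DMA unmaps ("mdev_state" and the callback name are assumptions):
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *
 *	mdev_state->iommu_nb.notifier_call = mdev_iommu_notifier;
 *	ret = vfio_register_notifier(&mdev_state->vdev, VFIO_IOMMU_NOTIFY,
 *				     &events, &mdev_state->iommu_nb);
 *	if (ret)
 *		return ret;
 *	...
 *	vfio_unregister_notifier(&mdev_state->vdev, VFIO_IOMMU_NOTIFY,
 *				 &mdev_state->iommu_nb);
 */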
2116 
2117 /*
2118  * Module/class support
2119  */
2120 static char *vfio_devnode(struct device *dev, umode_t *mode)
2121 {
2122 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2123 }
2124 
2125 static struct miscdevice vfio_dev = {
2126 	.minor = VFIO_MINOR,
2127 	.name = "vfio",
2128 	.fops = &vfio_fops,
2129 	.nodename = "vfio/vfio",
2130 	.mode = S_IRUGO | S_IWUGO,
2131 };
2132 
2133 static int __init vfio_init(void)
2134 {
2135 	int ret;
2136 
2137 	ida_init(&vfio.group_ida);
2138 	mutex_init(&vfio.group_lock);
2139 	mutex_init(&vfio.iommu_drivers_lock);
2140 	INIT_LIST_HEAD(&vfio.group_list);
2141 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2142 
2143 	ret = misc_register(&vfio_dev);
2144 	if (ret) {
2145 		pr_err("vfio: misc device register failed\n");
2146 		return ret;
2147 	}
2148 
2149 	/* /dev/vfio/$GROUP */
2150 	vfio.class = class_create(THIS_MODULE, "vfio");
2151 	if (IS_ERR(vfio.class)) {
2152 		ret = PTR_ERR(vfio.class);
2153 		goto err_class;
2154 	}
2155 
2156 	vfio.class->devnode = vfio_devnode;
2157 
2158 	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2159 	if (ret)
2160 		goto err_alloc_chrdev;
2161 
2162 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2163 
2164 #ifdef CONFIG_VFIO_NOIOMMU
2165 	vfio_register_iommu_driver(&vfio_noiommu_ops);
2166 #endif
2167 	return 0;
2168 
2169 err_alloc_chrdev:
2170 	class_destroy(vfio.class);
2171 	vfio.class = NULL;
2172 err_class:
2173 	misc_deregister(&vfio_dev);
2174 	return ret;
2175 }
2176 
2177 static void __exit vfio_cleanup(void)
2178 {
2179 	WARN_ON(!list_empty(&vfio.group_list));
2180 
2181 #ifdef CONFIG_VFIO_NOIOMMU
2182 	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2183 #endif
2184 	ida_destroy(&vfio.group_ida);
2185 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2186 	class_destroy(vfio.class);
2187 	vfio.class = NULL;
2188 	misc_deregister(&vfio_dev);
2189 	xa_destroy(&vfio_device_set_xa);
2190 }
2191 
2192 module_init(vfio_init);
2193 module_exit(vfio_cleanup);
2194 
2195 MODULE_VERSION(DRIVER_VERSION);
2196 MODULE_LICENSE("GPL v2");
2197 MODULE_AUTHOR(DRIVER_AUTHOR);
2198 MODULE_DESCRIPTION(DRIVER_DESC);
2199 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2200 MODULE_ALIAS("devname:vfio/vfio");
2201 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2202