1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
18 #include <linux/fs.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
35 #include "vfio.h"
36 
37 #define DRIVER_VERSION	"0.3"
38 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
39 #define DRIVER_DESC	"VFIO - User Level meta-driver"
40 
41 static struct vfio {
42 	struct class			*class;
43 	struct list_head		iommu_drivers_list;
44 	struct mutex			iommu_drivers_lock;
45 	struct list_head		group_list;
46 	struct mutex			group_lock; /* locks group_list */
47 	struct ida			group_ida;
48 	dev_t				group_devt;
49 } vfio;
50 
51 struct vfio_iommu_driver {
52 	const struct vfio_iommu_driver_ops	*ops;
53 	struct list_head			vfio_next;
54 };
55 
56 struct vfio_container {
57 	struct kref			kref;
58 	struct list_head		group_list;
59 	struct rw_semaphore		group_lock;
60 	struct vfio_iommu_driver	*iommu_driver;
61 	void				*iommu_data;
62 	bool				noiommu;
63 };
64 
65 struct vfio_unbound_dev {
66 	struct device			*dev;
67 	struct list_head		unbound_next;
68 };
69 
70 struct vfio_group {
71 	struct device 			dev;
72 	struct cdev			cdev;
73 	refcount_t			users;
74 	atomic_t			container_users;
75 	struct iommu_group		*iommu_group;
76 	struct vfio_container		*container;
77 	struct list_head		device_list;
78 	struct mutex			device_lock;
79 	struct notifier_block		nb;
80 	struct list_head		vfio_next;
81 	struct list_head		container_next;
82 	struct list_head		unbound_list;
83 	struct mutex			unbound_lock;
84 	atomic_t			opened;
85 	wait_queue_head_t		container_q;
86 	enum vfio_group_type		type;
87 	unsigned int			dev_counter;
88 	struct kvm			*kvm;
89 	struct blocking_notifier_head	notifier;
90 };
91 
92 #ifdef CONFIG_VFIO_NOIOMMU
93 static bool noiommu __read_mostly;
94 module_param_named(enable_unsafe_noiommu_mode,
95 		   noiommu, bool, S_IRUGO | S_IWUSR);
96 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
97 #endif
98 
99 static DEFINE_XARRAY(vfio_device_set_xa);
100 static const struct file_operations vfio_group_fops;
101 
102 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
103 {
104 	unsigned long idx = (unsigned long)set_id;
105 	struct vfio_device_set *new_dev_set;
106 	struct vfio_device_set *dev_set;
107 
108 	if (WARN_ON(!set_id))
109 		return -EINVAL;
110 
111 	/*
112 	 * Atomically acquire a singleton object in the xarray for this set_id
113 	 */
114 	xa_lock(&vfio_device_set_xa);
115 	dev_set = xa_load(&vfio_device_set_xa, idx);
116 	if (dev_set)
117 		goto found_get_ref;
118 	xa_unlock(&vfio_device_set_xa);
119 
120 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
121 	if (!new_dev_set)
122 		return -ENOMEM;
123 	mutex_init(&new_dev_set->lock);
124 	INIT_LIST_HEAD(&new_dev_set->device_list);
125 	new_dev_set->set_id = set_id;
126 
127 	xa_lock(&vfio_device_set_xa);
128 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
129 			       GFP_KERNEL);
130 	if (!dev_set) {
131 		dev_set = new_dev_set;
132 		goto found_get_ref;
133 	}
134 
135 	kfree(new_dev_set);
136 	if (xa_is_err(dev_set)) {
137 		xa_unlock(&vfio_device_set_xa);
138 		return xa_err(dev_set);
139 	}
140 
141 found_get_ref:
142 	dev_set->device_count++;
143 	xa_unlock(&vfio_device_set_xa);
144 	mutex_lock(&dev_set->lock);
145 	device->dev_set = dev_set;
146 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
147 	mutex_unlock(&dev_set->lock);
148 	return 0;
149 }
150 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
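/*
 * Example (sketch): a driver that wants several of its devices to share one
 * vfio_device_set passes the same opaque pointer as set_id from its probe
 * path.  vfio-pci-core, for instance, keys the set on the PCI slot or bus so
 * devices that can only be reset together land in one set.  All "foo_" names
 * below are hypothetical:
 *
 *	static int foo_probe(struct foo_hw *hw, struct foo_vfio_dev *fdev)
 *	{
 *		vfio_init_group_dev(&fdev->vdev, hw->dev, &foo_vfio_ops);
 *		// Same hw pointer => same vfio_device_set for each device.
 *		return vfio_assign_device_set(&fdev->vdev, hw);
 *	}
 */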
151 
152 static void vfio_release_device_set(struct vfio_device *device)
153 {
154 	struct vfio_device_set *dev_set = device->dev_set;
155 
156 	if (!dev_set)
157 		return;
158 
159 	mutex_lock(&dev_set->lock);
160 	list_del(&device->dev_set_list);
161 	mutex_unlock(&dev_set->lock);
162 
163 	xa_lock(&vfio_device_set_xa);
164 	if (!--dev_set->device_count) {
165 		__xa_erase(&vfio_device_set_xa,
166 			   (unsigned long)dev_set->set_id);
167 		mutex_destroy(&dev_set->lock);
168 		kfree(dev_set);
169 	}
170 	xa_unlock(&vfio_device_set_xa);
171 }
172 
173 #ifdef CONFIG_VFIO_NOIOMMU
174 static void *vfio_noiommu_open(unsigned long arg)
175 {
176 	if (arg != VFIO_NOIOMMU_IOMMU)
177 		return ERR_PTR(-EINVAL);
178 	if (!capable(CAP_SYS_RAWIO))
179 		return ERR_PTR(-EPERM);
180 
181 	return NULL;
182 }
183 
184 static void vfio_noiommu_release(void *iommu_data)
185 {
186 }
187 
188 static long vfio_noiommu_ioctl(void *iommu_data,
189 			       unsigned int cmd, unsigned long arg)
190 {
191 	if (cmd == VFIO_CHECK_EXTENSION)
192 		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
193 
194 	return -ENOTTY;
195 }
196 
197 static int vfio_noiommu_attach_group(void *iommu_data,
198 		struct iommu_group *iommu_group, enum vfio_group_type type)
199 {
200 	return 0;
201 }
202 
203 static void vfio_noiommu_detach_group(void *iommu_data,
204 				      struct iommu_group *iommu_group)
205 {
206 }
207 
208 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
209 	.name = "vfio-noiommu",
210 	.owner = THIS_MODULE,
211 	.open = vfio_noiommu_open,
212 	.release = vfio_noiommu_release,
213 	.ioctl = vfio_noiommu_ioctl,
214 	.attach_group = vfio_noiommu_attach_group,
215 	.detach_group = vfio_noiommu_detach_group,
216 };
217 
218 /*
219  * Only noiommu containers can use vfio-noiommu and noiommu containers can only
220  * use vfio-noiommu.
221  */
222 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
223 		const struct vfio_iommu_driver *driver)
224 {
225 	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
226 }
227 #else
228 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
229 		const struct vfio_iommu_driver *driver)
230 {
231 	return true;
232 }
233 #endif /* CONFIG_VFIO_NOIOMMU */
234 
235 /*
236  * IOMMU driver registration
237  */
238 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
239 {
240 	struct vfio_iommu_driver *driver, *tmp;
241 
242 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
243 	if (!driver)
244 		return -ENOMEM;
245 
246 	driver->ops = ops;
247 
248 	mutex_lock(&vfio.iommu_drivers_lock);
249 
250 	/* Check for duplicates */
251 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
252 		if (tmp->ops == ops) {
253 			mutex_unlock(&vfio.iommu_drivers_lock);
254 			kfree(driver);
255 			return -EINVAL;
256 		}
257 	}
258 
259 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
260 
261 	mutex_unlock(&vfio.iommu_drivers_lock);
262 
263 	return 0;
264 }
265 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
266 
267 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
268 {
269 	struct vfio_iommu_driver *driver;
270 
271 	mutex_lock(&vfio.iommu_drivers_lock);
272 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
273 		if (driver->ops == ops) {
274 			list_del(&driver->vfio_next);
275 			mutex_unlock(&vfio.iommu_drivers_lock);
276 			kfree(driver);
277 			return;
278 		}
279 	}
280 	mutex_unlock(&vfio.iommu_drivers_lock);
281 }
282 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
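/*
 * Example (sketch): an IOMMU backend such as vfio_iommu_type1 registers its
 * ops table from module init and removes it on module exit.  "foo_iommu_ops"
 * stands in for a backend's const struct vfio_iommu_driver_ops:
 *
 *	static int __init foo_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&foo_iommu_ops);
 *	}
 *
 *	static void __exit foo_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&foo_iommu_ops);
 *	}
 */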
283 
284 static int vfio_iommu_group_notifier(struct notifier_block *nb,
285 				     unsigned long action, void *data);
286 static void vfio_group_get(struct vfio_group *group);
287 
288 /*
289  * Container objects - containers are created when /dev/vfio/vfio is
290  * opened, but their lifecycle extends until the last user is done, so
291  * it's freed via kref.  Must support container/group/device being
292  * closed in any order.
293  */
294 static void vfio_container_get(struct vfio_container *container)
295 {
296 	kref_get(&container->kref);
297 }
298 
299 static void vfio_container_release(struct kref *kref)
300 {
301 	struct vfio_container *container;
302 	container = container_of(kref, struct vfio_container, kref);
303 
304 	kfree(container);
305 }
306 
307 static void vfio_container_put(struct vfio_container *container)
308 {
309 	kref_put(&container->kref, vfio_container_release);
310 }
311 
312 /*
313  * Group objects - create, release, get, put, search
314  */
315 static struct vfio_group *
316 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
317 {
318 	struct vfio_group *group;
319 
320 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
321 		if (group->iommu_group == iommu_group) {
322 			vfio_group_get(group);
323 			return group;
324 		}
325 	}
326 	return NULL;
327 }
328 
329 static struct vfio_group *
330 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
331 {
332 	struct vfio_group *group;
333 
334 	mutex_lock(&vfio.group_lock);
335 	group = __vfio_group_get_from_iommu(iommu_group);
336 	mutex_unlock(&vfio.group_lock);
337 	return group;
338 }
339 
340 static void vfio_group_release(struct device *dev)
341 {
342 	struct vfio_group *group = container_of(dev, struct vfio_group, dev);
343 	struct vfio_unbound_dev *unbound, *tmp;
344 
345 	list_for_each_entry_safe(unbound, tmp,
346 				 &group->unbound_list, unbound_next) {
347 		list_del(&unbound->unbound_next);
348 		kfree(unbound);
349 	}
350 
351 	mutex_destroy(&group->device_lock);
352 	mutex_destroy(&group->unbound_lock);
353 	iommu_group_put(group->iommu_group);
354 	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
355 	kfree(group);
356 }
357 
358 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
359 					   enum vfio_group_type type)
360 {
361 	struct vfio_group *group;
362 	int minor;
363 
364 	group = kzalloc(sizeof(*group), GFP_KERNEL);
365 	if (!group)
366 		return ERR_PTR(-ENOMEM);
367 
368 	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
369 	if (minor < 0) {
370 		kfree(group);
371 		return ERR_PTR(minor);
372 	}
373 
374 	device_initialize(&group->dev);
375 	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
376 	group->dev.class = vfio.class;
377 	group->dev.release = vfio_group_release;
378 	cdev_init(&group->cdev, &vfio_group_fops);
379 	group->cdev.owner = THIS_MODULE;
380 
381 	refcount_set(&group->users, 1);
382 	INIT_LIST_HEAD(&group->device_list);
383 	mutex_init(&group->device_lock);
384 	INIT_LIST_HEAD(&group->unbound_list);
385 	mutex_init(&group->unbound_lock);
386 	init_waitqueue_head(&group->container_q);
387 	group->iommu_group = iommu_group;
388 	/* put in vfio_group_release() */
389 	iommu_group_ref_get(iommu_group);
390 	group->type = type;
391 	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
392 
393 	return group;
394 }
395 
396 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
397 		enum vfio_group_type type)
398 {
399 	struct vfio_group *group;
400 	struct vfio_group *ret;
401 	int err;
402 
403 	group = vfio_group_alloc(iommu_group, type);
404 	if (IS_ERR(group))
405 		return group;
406 
407 	err = dev_set_name(&group->dev, "%s%d",
408 			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
409 			   iommu_group_id(iommu_group));
410 	if (err) {
411 		ret = ERR_PTR(err);
412 		goto err_put;
413 	}
414 
415 	group->nb.notifier_call = vfio_iommu_group_notifier;
416 	err = iommu_group_register_notifier(iommu_group, &group->nb);
417 	if (err) {
418 		ret = ERR_PTR(err);
419 		goto err_put;
420 	}
421 
422 	mutex_lock(&vfio.group_lock);
423 
424 	/* Did we race creating this group? */
425 	ret = __vfio_group_get_from_iommu(iommu_group);
426 	if (ret)
427 		goto err_unlock;
428 
429 	err = cdev_device_add(&group->cdev, &group->dev);
430 	if (err) {
431 		ret = ERR_PTR(err);
432 		goto err_unlock;
433 	}
434 
435 	list_add(&group->vfio_next, &vfio.group_list);
436 
437 	mutex_unlock(&vfio.group_lock);
438 	return group;
439 
440 err_unlock:
441 	mutex_unlock(&vfio.group_lock);
442 	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
443 err_put:
444 	put_device(&group->dev);
445 	return ret;
446 }
447 
448 static void vfio_group_put(struct vfio_group *group)
449 {
450 	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
451 		return;
452 
453 	/*
454 	 * These data structures all have paired operations that can only be
455 	 * undone when the caller holds a live reference on the group. Since all
456 	 * pairs must be undone these WARN_ON's indicate some caller did not
457 	 * properly hold the group reference.
458 	 */
459 	WARN_ON(!list_empty(&group->device_list));
460 	WARN_ON(atomic_read(&group->container_users));
461 	WARN_ON(group->notifier.head);
462 
463 	list_del(&group->vfio_next);
464 	cdev_device_del(&group->cdev, &group->dev);
465 	mutex_unlock(&vfio.group_lock);
466 
467 	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
468 	put_device(&group->dev);
469 }
470 
471 static void vfio_group_get(struct vfio_group *group)
472 {
473 	refcount_inc(&group->users);
474 }
475 
476 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
477 {
478 	struct iommu_group *iommu_group;
479 	struct vfio_group *group;
480 
481 	iommu_group = iommu_group_get(dev);
482 	if (!iommu_group)
483 		return NULL;
484 
485 	group = vfio_group_get_from_iommu(iommu_group);
486 	iommu_group_put(iommu_group);
487 
488 	return group;
489 }
490 
491 /*
492  * Device objects - create, release, get, put, search
493  */
494 /* Device reference always implies a group reference */
495 void vfio_device_put(struct vfio_device *device)
496 {
497 	if (refcount_dec_and_test(&device->refcount))
498 		complete(&device->comp);
499 }
500 EXPORT_SYMBOL_GPL(vfio_device_put);
501 
502 static bool vfio_device_try_get(struct vfio_device *device)
503 {
504 	return refcount_inc_not_zero(&device->refcount);
505 }
506 
507 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
508 						 struct device *dev)
509 {
510 	struct vfio_device *device;
511 
512 	mutex_lock(&group->device_lock);
513 	list_for_each_entry(device, &group->device_list, group_next) {
514 		if (device->dev == dev && vfio_device_try_get(device)) {
515 			mutex_unlock(&group->device_lock);
516 			return device;
517 		}
518 	}
519 	mutex_unlock(&group->device_lock);
520 	return NULL;
521 }
522 
523 /*
524  * Some drivers, like pci-stub, are only used to prevent other drivers from
525  * claiming a device and are therefore perfectly legitimate for a user owned
526  * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
527  * of the device, but it does prevent the user from having direct access to
528  * the device, which is useful in some circumstances.
529  *
530  * We also assume that we can include PCI interconnect devices, i.e. bridges.
531  * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
532  * then all of the downstream devices will be part of the same IOMMU group as
533  * the bridge.  Thus, if placing the bridge into the user owned IOVA space
534  * breaks anything, it only does so for user owned devices downstream.  Note
535  * that error notification via MSI can be affected for platforms that handle
536  * MSI within the same IOVA space as DMA.
537  */
538 static const char * const vfio_driver_allowed[] = { "pci-stub" };
539 
540 static bool vfio_dev_driver_allowed(struct device *dev,
541 				    struct device_driver *drv)
542 {
543 	if (dev_is_pci(dev)) {
544 		struct pci_dev *pdev = to_pci_dev(dev);
545 
546 		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
547 			return true;
548 	}
549 
550 	return match_string(vfio_driver_allowed,
551 			    ARRAY_SIZE(vfio_driver_allowed),
552 			    drv->name) >= 0;
553 }
554 
555 /*
556  * A vfio group is viable for use by userspace if all devices are in
557  * one of the following states:
558  *  - driver-less
559  *  - bound to a vfio driver
560  *  - bound to an otherwise allowed driver
561  *  - a PCI interconnect device
562  *
563  * We use two methods to determine whether a device is bound to a vfio
564  * driver.  The first is to test whether the device exists in the vfio
565  * group.  The second is to test if the device exists on the group
566  * unbound_list, indicating it's in the middle of transitioning from
567  * a vfio driver to driver-less.
568  */
569 static int vfio_dev_viable(struct device *dev, void *data)
570 {
571 	struct vfio_group *group = data;
572 	struct vfio_device *device;
573 	struct device_driver *drv = READ_ONCE(dev->driver);
574 	struct vfio_unbound_dev *unbound;
575 	int ret = -EINVAL;
576 
577 	mutex_lock(&group->unbound_lock);
578 	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
579 		if (dev == unbound->dev) {
580 			ret = 0;
581 			break;
582 		}
583 	}
584 	mutex_unlock(&group->unbound_lock);
585 
586 	if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
587 		return 0;
588 
589 	device = vfio_group_get_device(group, dev);
590 	if (device) {
591 		vfio_device_put(device);
592 		return 0;
593 	}
594 
595 	return ret;
596 }
597 
598 /*
599  * Async device support
600  */
601 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
602 {
603 	struct vfio_device *device;
604 
605 	/* Do we already know about it?  We shouldn't */
606 	device = vfio_group_get_device(group, dev);
607 	if (WARN_ON_ONCE(device)) {
608 		vfio_device_put(device);
609 		return 0;
610 	}
611 
612 	/* Nothing to do for idle groups */
613 	if (!atomic_read(&group->container_users))
614 		return 0;
615 
616 	/* TODO Prevent device auto probing */
617 	dev_WARN(dev, "Device added to live group %d!\n",
618 		 iommu_group_id(group->iommu_group));
619 
620 	return 0;
621 }
622 
623 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
624 {
625 	/* We don't care what happens when the group isn't in use */
626 	if (!atomic_read(&group->container_users))
627 		return 0;
628 
629 	return vfio_dev_viable(dev, group);
630 }
631 
632 static int vfio_iommu_group_notifier(struct notifier_block *nb,
633 				     unsigned long action, void *data)
634 {
635 	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
636 	struct device *dev = data;
637 	struct vfio_unbound_dev *unbound;
638 
639 	switch (action) {
640 	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
641 		vfio_group_nb_add_dev(group, dev);
642 		break;
643 	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
644 		/*
645 		 * Nothing to do here.  If the device is in use, then the
646 		 * vfio sub-driver should block the remove callback until
647 		 * it is unused.  If the device is unused or attached to a
648 		 * stub driver, then it should be released and we don't
649 		 * care that it will be going away.
650 		 */
651 		break;
652 	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
653 		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
654 			iommu_group_id(group->iommu_group));
655 		break;
656 	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
657 		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
658 			iommu_group_id(group->iommu_group), dev->driver->name);
659 		BUG_ON(vfio_group_nb_verify(group, dev));
660 		break;
661 	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
662 		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
663 			__func__, iommu_group_id(group->iommu_group),
664 			dev->driver->name);
665 		break;
666 	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
667 		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
668 			iommu_group_id(group->iommu_group));
669 		/*
670 		 * XXX An unbound device in a live group is ok, but we'd
671 		 * really like to avoid the above BUG_ON by preventing other
672 		 * drivers from binding to it.  Once that occurs, we have to
673 		 * stop the system to maintain isolation.  At a minimum, we'd
674 		 * want a toggle to disable driver auto probe for this device.
675 		 */
676 
677 		mutex_lock(&group->unbound_lock);
678 		list_for_each_entry(unbound,
679 				    &group->unbound_list, unbound_next) {
680 			if (dev == unbound->dev) {
681 				list_del(&unbound->unbound_next);
682 				kfree(unbound);
683 				break;
684 			}
685 		}
686 		mutex_unlock(&group->unbound_lock);
687 		break;
688 	}
689 	return NOTIFY_OK;
690 }
691 
692 /*
693  * VFIO driver API
694  */
695 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
696 			 const struct vfio_device_ops *ops)
697 {
698 	init_completion(&device->comp);
699 	device->dev = dev;
700 	device->ops = ops;
701 }
702 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
703 
704 void vfio_uninit_group_dev(struct vfio_device *device)
705 {
706 	vfio_release_device_set(device);
707 }
708 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
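/*
 * Sketch of the expected driver call sequence (all "foo_" names are
 * hypothetical).  The vfio_device is embedded in the driver's own structure
 * and initialized before registration; teardown runs in the reverse order:
 *
 *	struct foo_device {
 *		struct vfio_device vdev;
 *	};
 *
 *	vfio_init_group_dev(&foo->vdev, dev, &foo_vfio_ops);
 *	err = vfio_register_group_dev(&foo->vdev);
 *	if (err)
 *		vfio_uninit_group_dev(&foo->vdev);
 *	...
 *	vfio_unregister_group_dev(&foo->vdev);
 *	vfio_uninit_group_dev(&foo->vdev);
 */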
709 
710 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
711 		enum vfio_group_type type)
712 {
713 	struct iommu_group *iommu_group;
714 	struct vfio_group *group;
715 	int ret;
716 
717 	iommu_group = iommu_group_alloc();
718 	if (IS_ERR(iommu_group))
719 		return ERR_CAST(iommu_group);
720 
721 	iommu_group_set_name(iommu_group, "vfio-noiommu");
722 	ret = iommu_group_add_device(iommu_group, dev);
723 	if (ret)
724 		goto out_put_group;
725 
726 	group = vfio_create_group(iommu_group, type);
727 	if (IS_ERR(group)) {
728 		ret = PTR_ERR(group);
729 		goto out_remove_device;
730 	}
731 	iommu_group_put(iommu_group);
732 	return group;
733 
734 out_remove_device:
735 	iommu_group_remove_device(dev);
736 out_put_group:
737 	iommu_group_put(iommu_group);
738 	return ERR_PTR(ret);
739 }
740 
741 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
742 {
743 	struct iommu_group *iommu_group;
744 	struct vfio_group *group;
745 
746 	iommu_group = iommu_group_get(dev);
747 #ifdef CONFIG_VFIO_NOIOMMU
748 	if (!iommu_group && noiommu && !iommu_present(dev->bus)) {
749 		/*
750 		 * With noiommu enabled, create an IOMMU group for devices that
751 		 * don't already have one and don't have an iommu_ops on their
752 		 * bus.  Taint the kernel because we're about to give a DMA
753 		 * capable device to a user without IOMMU protection.
754 		 */
755 		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
756 		if (!IS_ERR(group)) {
757 			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
758 			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
759 		}
760 		return group;
761 	}
762 #endif
763 	if (!iommu_group)
764 		return ERR_PTR(-EINVAL);
765 
766 	group = vfio_group_get_from_iommu(iommu_group);
767 	if (!group)
768 		group = vfio_create_group(iommu_group, VFIO_IOMMU);
769 
770 	/* The vfio_group holds a reference to the iommu_group */
771 	iommu_group_put(iommu_group);
772 	return group;
773 }
774 
775 static int __vfio_register_dev(struct vfio_device *device,
776 		struct vfio_group *group)
777 {
778 	struct vfio_device *existing_device;
779 
780 	if (IS_ERR(group))
781 		return PTR_ERR(group);
782 
783 	/*
784 	 * If the driver doesn't specify a set then the device is added to a
785 	 * singleton set just for itself.
786 	 */
787 	if (!device->dev_set)
788 		vfio_assign_device_set(device, device);
789 
790 	existing_device = vfio_group_get_device(group, device->dev);
791 	if (existing_device) {
792 		dev_WARN(device->dev, "Device already exists on group %d\n",
793 			 iommu_group_id(group->iommu_group));
794 		vfio_device_put(existing_device);
795 		if (group->type == VFIO_NO_IOMMU ||
796 		    group->type == VFIO_EMULATED_IOMMU)
797 			iommu_group_remove_device(device->dev);
798 		vfio_group_put(group);
799 		return -EBUSY;
800 	}
801 
802 	/* Our reference on group is moved to the device */
803 	device->group = group;
804 
805 	/* Refcounting can't start until the driver calls register */
806 	refcount_set(&device->refcount, 1);
807 
808 	mutex_lock(&group->device_lock);
809 	list_add(&device->group_next, &group->device_list);
810 	group->dev_counter++;
811 	mutex_unlock(&group->device_lock);
812 
813 	return 0;
814 }
815 
816 int vfio_register_group_dev(struct vfio_device *device)
817 {
818 	return __vfio_register_dev(device,
819 		vfio_group_find_or_alloc(device->dev));
820 }
821 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
822 
823 /*
824  * Register a virtual device without IOMMU backing.  The user of this
825  * device must not be able to directly trigger unmediated DMA.
826  */
827 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
828 {
829 	return __vfio_register_dev(device,
830 		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
831 }
832 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
833 
834 /*
835  * Get a reference to the vfio_device for a device.  Even if the
836  * caller thinks they own the device, they could be racing with a
837  * release call path, so we can't trust drvdata for the shortcut.
838  * Go the long way around, from the iommu_group to the vfio_group
839  * to the vfio_device.
840  */
841 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
842 {
843 	struct vfio_group *group;
844 	struct vfio_device *device;
845 
846 	group = vfio_group_get_from_dev(dev);
847 	if (!group)
848 		return NULL;
849 
850 	device = vfio_group_get_device(group, dev);
851 	vfio_group_put(group);
852 
853 	return device;
854 }
855 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
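/*
 * A non-NULL return from vfio_device_get_from_dev() holds a reference that
 * the caller must drop with vfio_device_put(), e.g.:
 *
 *	device = vfio_device_get_from_dev(dev);
 *	if (device) {
 *		...
 *		vfio_device_put(device);
 *	}
 */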
856 
857 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
858 						     char *buf)
859 {
860 	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
861 
862 	mutex_lock(&group->device_lock);
863 	list_for_each_entry(it, &group->device_list, group_next) {
864 		int ret;
865 
866 		if (it->ops->match) {
867 			ret = it->ops->match(it, buf);
868 			if (ret < 0) {
869 				device = ERR_PTR(ret);
870 				break;
871 			}
872 		} else {
873 			ret = !strcmp(dev_name(it->dev), buf);
874 		}
875 
876 		if (ret && vfio_device_try_get(it)) {
877 			device = it;
878 			break;
879 		}
880 	}
881 	mutex_unlock(&group->device_lock);
882 
883 	return device;
884 }
885 
886 /*
887  * Decrement the device reference count and wait for the device to be
888  * removed.  Open file descriptors for the device... */
889 void vfio_unregister_group_dev(struct vfio_device *device)
890 {
891 	struct vfio_group *group = device->group;
892 	struct vfio_unbound_dev *unbound;
893 	unsigned int i = 0;
894 	bool interrupted = false;
895 	long rc;
896 
897 	/*
898 	 * When the device is removed from the group, the group suddenly
899 	 * becomes non-viable; the device has a driver (until the unbind
900 	 * completes), but it's not present in the group.  This is bad news
901 	 * for any external users that need to re-acquire a group reference
902 	 * in order to match and release their existing reference.  To
903 	 * solve this, we track such devices on the unbound_list to bridge
904 	 * the gap until they're fully unbound.
905 	 */
906 	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
907 	if (unbound) {
908 		unbound->dev = device->dev;
909 		mutex_lock(&group->unbound_lock);
910 		list_add(&unbound->unbound_next, &group->unbound_list);
911 		mutex_unlock(&group->unbound_lock);
912 	}
913 	WARN_ON(!unbound);
914 
915 	vfio_device_put(device);
916 	rc = try_wait_for_completion(&device->comp);
917 	while (rc <= 0) {
918 		if (device->ops->request)
919 			device->ops->request(device, i++);
920 
921 		if (interrupted) {
922 			rc = wait_for_completion_timeout(&device->comp,
923 							 HZ * 10);
924 		} else {
925 			rc = wait_for_completion_interruptible_timeout(
926 				&device->comp, HZ * 10);
927 			if (rc < 0) {
928 				interrupted = true;
929 				dev_warn(device->dev,
930 					 "Device is currently in use, task"
931 					 " \"%s\" (%d) "
932 					 "blocked until device is released",
933 					 current->comm, task_pid_nr(current));
934 			}
935 		}
936 	}
937 
938 	mutex_lock(&group->device_lock);
939 	list_del(&device->group_next);
940 	group->dev_counter--;
941 	mutex_unlock(&group->device_lock);
942 
943 	/*
944 	 * In order to support multiple devices per group, devices can be
945 	 * plucked from the group while other devices in the group are still
946 	 * in use.  The container persists with this group and those remaining
947 	 * devices still attached.  If the user creates an isolation violation
948 	 * by binding this device to another driver while the group is still in
949 	 * use, that's their fault.  However, in the case of removing the last,
950 	 * or potentially the only, device in the group there can be no other
951 	 * in-use devices in the group.  The user has done their due diligence
952 	 * and we should lay no claims to those devices.  In order to do that,
953 	 * we need to make sure the group is detached from the container.
954 	 * Without this stall, we're potentially racing with a user process
955 	 * that may attempt to immediately bind this device to another driver.
956 	 */
957 	if (list_empty(&group->device_list))
958 		wait_event(group->container_q, !group->container);
959 
960 	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
961 		iommu_group_remove_device(device->dev);
962 
963 	/* Matches the get in vfio_register_group_dev() */
964 	vfio_group_put(group);
965 }
966 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
967 
968 /*
969  * VFIO base fd, /dev/vfio/vfio
970  */
971 static long vfio_ioctl_check_extension(struct vfio_container *container,
972 				       unsigned long arg)
973 {
974 	struct vfio_iommu_driver *driver;
975 	long ret = 0;
976 
977 	down_read(&container->group_lock);
978 
979 	driver = container->iommu_driver;
980 
981 	switch (arg) {
982 		/* No base extensions yet */
983 	default:
984 		/*
985 		 * If no driver is set, poll all registered drivers for
986 		 * extensions and return the first positive result.  If
987 		 * a driver is already set, further queries will be passed
988 		 * only to that driver.
989 		 */
990 		if (!driver) {
991 			mutex_lock(&vfio.iommu_drivers_lock);
992 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
993 					    vfio_next) {
994 
995 				if (!list_empty(&container->group_list) &&
996 				    !vfio_iommu_driver_allowed(container,
997 							       driver))
998 					continue;
999 				if (!try_module_get(driver->ops->owner))
1000 					continue;
1001 
1002 				ret = driver->ops->ioctl(NULL,
1003 							 VFIO_CHECK_EXTENSION,
1004 							 arg);
1005 				module_put(driver->ops->owner);
1006 				if (ret > 0)
1007 					break;
1008 			}
1009 			mutex_unlock(&vfio.iommu_drivers_lock);
1010 		} else
1011 			ret = driver->ops->ioctl(container->iommu_data,
1012 						 VFIO_CHECK_EXTENSION, arg);
1013 	}
1014 
1015 	up_read(&container->group_lock);
1016 
1017 	return ret;
1018 }
1019 
1020 /* hold write lock on container->group_lock */
1021 static int __vfio_container_attach_groups(struct vfio_container *container,
1022 					  struct vfio_iommu_driver *driver,
1023 					  void *data)
1024 {
1025 	struct vfio_group *group;
1026 	int ret = -ENODEV;
1027 
1028 	list_for_each_entry(group, &container->group_list, container_next) {
1029 		ret = driver->ops->attach_group(data, group->iommu_group,
1030 						group->type);
1031 		if (ret)
1032 			goto unwind;
1033 	}
1034 
1035 	return ret;
1036 
1037 unwind:
1038 	list_for_each_entry_continue_reverse(group, &container->group_list,
1039 					     container_next) {
1040 		driver->ops->detach_group(data, group->iommu_group);
1041 	}
1042 
1043 	return ret;
1044 }
1045 
1046 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1047 				 unsigned long arg)
1048 {
1049 	struct vfio_iommu_driver *driver;
1050 	long ret = -ENODEV;
1051 
1052 	down_write(&container->group_lock);
1053 
1054 	/*
1055 	 * The container is designed to be an unprivileged interface while
1056 	 * the group can be assigned to specific users.  Therefore, only by
1057 	 * adding a group to a container does the user get the privilege of
1058 	 * enabling the iommu, which may allocate finite resources.  There
1059 	 * is no unset_iommu, but by removing all the groups from a container,
1060 	 * the container is deprivileged and returns to an unset state.
1061 	 */
1062 	if (list_empty(&container->group_list) || container->iommu_driver) {
1063 		up_write(&container->group_lock);
1064 		return -EINVAL;
1065 	}
1066 
1067 	mutex_lock(&vfio.iommu_drivers_lock);
1068 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1069 		void *data;
1070 
1071 		if (!vfio_iommu_driver_allowed(container, driver))
1072 			continue;
1073 		if (!try_module_get(driver->ops->owner))
1074 			continue;
1075 
1076 		/*
1077 		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1078 		 * so test which iommu driver reported support for this
1079 		 * extension and call open on them.  We also pass them the
1080 		 * magic, allowing a single driver to support multiple
1081 		 * interfaces if they'd like.
1082 		 */
1083 		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1084 			module_put(driver->ops->owner);
1085 			continue;
1086 		}
1087 
1088 		data = driver->ops->open(arg);
1089 		if (IS_ERR(data)) {
1090 			ret = PTR_ERR(data);
1091 			module_put(driver->ops->owner);
1092 			continue;
1093 		}
1094 
1095 		ret = __vfio_container_attach_groups(container, driver, data);
1096 		if (ret) {
1097 			driver->ops->release(data);
1098 			module_put(driver->ops->owner);
1099 			continue;
1100 		}
1101 
1102 		container->iommu_driver = driver;
1103 		container->iommu_data = data;
1104 		break;
1105 	}
1106 
1107 	mutex_unlock(&vfio.iommu_drivers_lock);
1108 	up_write(&container->group_lock);
1109 
1110 	return ret;
1111 }
1112 
1113 static long vfio_fops_unl_ioctl(struct file *filep,
1114 				unsigned int cmd, unsigned long arg)
1115 {
1116 	struct vfio_container *container = filep->private_data;
1117 	struct vfio_iommu_driver *driver;
1118 	void *data;
1119 	long ret = -EINVAL;
1120 
1121 	if (!container)
1122 		return ret;
1123 
1124 	switch (cmd) {
1125 	case VFIO_GET_API_VERSION:
1126 		ret = VFIO_API_VERSION;
1127 		break;
1128 	case VFIO_CHECK_EXTENSION:
1129 		ret = vfio_ioctl_check_extension(container, arg);
1130 		break;
1131 	case VFIO_SET_IOMMU:
1132 		ret = vfio_ioctl_set_iommu(container, arg);
1133 		break;
1134 	default:
1135 		driver = container->iommu_driver;
1136 		data = container->iommu_data;
1137 
1138 		if (driver) /* passthrough all unrecognized ioctls */
1139 			ret = driver->ops->ioctl(data, cmd, arg);
1140 	}
1141 
1142 	return ret;
1143 }
1144 
1145 static int vfio_fops_open(struct inode *inode, struct file *filep)
1146 {
1147 	struct vfio_container *container;
1148 
1149 	container = kzalloc(sizeof(*container), GFP_KERNEL);
1150 	if (!container)
1151 		return -ENOMEM;
1152 
1153 	INIT_LIST_HEAD(&container->group_list);
1154 	init_rwsem(&container->group_lock);
1155 	kref_init(&container->kref);
1156 
1157 	filep->private_data = container;
1158 
1159 	return 0;
1160 }
1161 
1162 static int vfio_fops_release(struct inode *inode, struct file *filep)
1163 {
1164 	struct vfio_container *container = filep->private_data;
1165 	struct vfio_iommu_driver *driver = container->iommu_driver;
1166 
1167 	if (driver && driver->ops->notify)
1168 		driver->ops->notify(container->iommu_data,
1169 				    VFIO_IOMMU_CONTAINER_CLOSE);
1170 
1171 	filep->private_data = NULL;
1172 
1173 	vfio_container_put(container);
1174 
1175 	return 0;
1176 }
1177 
1178 static const struct file_operations vfio_fops = {
1179 	.owner		= THIS_MODULE,
1180 	.open		= vfio_fops_open,
1181 	.release	= vfio_fops_release,
1182 	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1183 	.compat_ioctl	= compat_ptr_ioctl,
1184 };
1185 
1186 /*
1187  * VFIO Group fd, /dev/vfio/$GROUP
1188  */
1189 static void __vfio_group_unset_container(struct vfio_group *group)
1190 {
1191 	struct vfio_container *container = group->container;
1192 	struct vfio_iommu_driver *driver;
1193 
1194 	down_write(&container->group_lock);
1195 
1196 	driver = container->iommu_driver;
1197 	if (driver)
1198 		driver->ops->detach_group(container->iommu_data,
1199 					  group->iommu_group);
1200 
1201 	group->container = NULL;
1202 	wake_up(&group->container_q);
1203 	list_del(&group->container_next);
1204 
1205 	/* Detaching the last group deprivileges a container, remove iommu */
1206 	if (driver && list_empty(&container->group_list)) {
1207 		driver->ops->release(container->iommu_data);
1208 		module_put(driver->ops->owner);
1209 		container->iommu_driver = NULL;
1210 		container->iommu_data = NULL;
1211 	}
1212 
1213 	up_write(&container->group_lock);
1214 
1215 	vfio_container_put(container);
1216 }
1217 
1218 /*
1219  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1220  * if there was no container to unset.  Since the ioctl is called on
1221  * the group, we know that still exists, therefore the only valid
1222  * transition here is 1->0.
1223  */
1224 static int vfio_group_unset_container(struct vfio_group *group)
1225 {
1226 	int users = atomic_cmpxchg(&group->container_users, 1, 0);
1227 
1228 	if (!users)
1229 		return -EINVAL;
1230 	if (users != 1)
1231 		return -EBUSY;
1232 
1233 	__vfio_group_unset_container(group);
1234 
1235 	return 0;
1236 }
1237 
1238 /*
1239  * When removing container users, anything that removes the last user
1240  * implicitly removes the group from the container.  That is, if the
1241  * group file descriptor is closed, as well as any device file descriptors,
1242  * the group is free.
1243  */
1244 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1245 {
1246 	if (0 == atomic_dec_if_positive(&group->container_users))
1247 		__vfio_group_unset_container(group);
1248 }
1249 
1250 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1251 {
1252 	struct fd f;
1253 	struct vfio_container *container;
1254 	struct vfio_iommu_driver *driver;
1255 	int ret = 0;
1256 
1257 	if (atomic_read(&group->container_users))
1258 		return -EINVAL;
1259 
1260 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1261 		return -EPERM;
1262 
1263 	f = fdget(container_fd);
1264 	if (!f.file)
1265 		return -EBADF;
1266 
1267 	/* Sanity check, is this really our fd? */
1268 	if (f.file->f_op != &vfio_fops) {
1269 		fdput(f);
1270 		return -EINVAL;
1271 	}
1272 
1273 	container = f.file->private_data;
1274 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1275 
1276 	down_write(&container->group_lock);
1277 
1278 	/* Real groups and fake groups cannot mix */
1279 	if (!list_empty(&container->group_list) &&
1280 	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
1281 		ret = -EPERM;
1282 		goto unlock_out;
1283 	}
1284 
1285 	driver = container->iommu_driver;
1286 	if (driver) {
1287 		ret = driver->ops->attach_group(container->iommu_data,
1288 						group->iommu_group,
1289 						group->type);
1290 		if (ret)
1291 			goto unlock_out;
1292 	}
1293 
1294 	group->container = container;
1295 	container->noiommu = (group->type == VFIO_NO_IOMMU);
1296 	list_add(&group->container_next, &container->group_list);
1297 
1298 	/* Get a reference on the container and mark a user within the group */
1299 	vfio_container_get(container);
1300 	atomic_inc(&group->container_users);
1301 
1302 unlock_out:
1303 	up_write(&container->group_lock);
1304 	fdput(f);
1305 	return ret;
1306 }
1307 
1308 static bool vfio_group_viable(struct vfio_group *group)
1309 {
1310 	return (iommu_group_for_each_dev(group->iommu_group,
1311 					 group, vfio_dev_viable) == 0);
1312 }
1313 
1314 static int vfio_group_add_container_user(struct vfio_group *group)
1315 {
1316 	if (!atomic_inc_not_zero(&group->container_users))
1317 		return -EINVAL;
1318 
1319 	if (group->type == VFIO_NO_IOMMU) {
1320 		atomic_dec(&group->container_users);
1321 		return -EPERM;
1322 	}
1323 	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1324 		atomic_dec(&group->container_users);
1325 		return -EINVAL;
1326 	}
1327 
1328 	return 0;
1329 }
1330 
1331 static const struct file_operations vfio_device_fops;
1332 
1333 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1334 {
1335 	struct vfio_device *device;
1336 	struct file *filep;
1337 	int fdno;
1338 	int ret = 0;
1339 
1340 	if (0 == atomic_read(&group->container_users) ||
1341 	    !group->container->iommu_driver || !vfio_group_viable(group))
1342 		return -EINVAL;
1343 
1344 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1345 		return -EPERM;
1346 
1347 	device = vfio_device_get_from_name(group, buf);
1348 	if (IS_ERR(device))
1349 		return PTR_ERR(device);
1350 
1351 	if (!try_module_get(device->dev->driver->owner)) {
1352 		ret = -ENODEV;
1353 		goto err_device_put;
1354 	}
1355 
1356 	mutex_lock(&device->dev_set->lock);
1357 	device->open_count++;
1358 	if (device->open_count == 1 && device->ops->open_device) {
1359 		ret = device->ops->open_device(device);
1360 		if (ret)
1361 			goto err_undo_count;
1362 	}
1363 	mutex_unlock(&device->dev_set->lock);
1364 
1365 	/*
1366 	 * We can't use anon_inode_getfd() because we need to modify
1367 	 * the f_mode flags directly to allow more than just ioctls
1368 	 */
1369 	fdno = ret = get_unused_fd_flags(O_CLOEXEC);
1370 	if (ret < 0)
1371 		goto err_close_device;
1372 
1373 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1374 				   device, O_RDWR);
1375 	if (IS_ERR(filep)) {
1376 		ret = PTR_ERR(filep);
1377 		goto err_fd;
1378 	}
1379 
1380 	/*
1381 	 * TODO: add an anon_inode interface to do this.
1382 	 * Appears to be missing by lack of need rather than
1383 	 * explicitly prevented.  Now there's need.
1384 	 */
1385 	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1386 
1387 	atomic_inc(&group->container_users);
1388 
1389 	fd_install(fdno, filep);
1390 
1391 	if (group->type == VFIO_NO_IOMMU)
1392 		dev_warn(device->dev, "vfio-noiommu device opened by user "
1393 			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1394 	return fdno;
1395 
1396 err_fd:
1397 	put_unused_fd(fdno);
1398 err_close_device:
1399 	mutex_lock(&device->dev_set->lock);
1400 	if (device->open_count == 1 && device->ops->close_device)
1401 		device->ops->close_device(device);
1402 err_undo_count:
1403 	device->open_count--;
1404 	mutex_unlock(&device->dev_set->lock);
1405 	module_put(device->dev->driver->owner);
1406 err_device_put:
1407 	vfio_device_put(device);
1408 	return ret;
1409 }
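/*
 * For context, the userspace side of the above is roughly the following
 * sequence (error handling omitted; the group number and device name are
 * examples only):
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container, VFIO_GET_API_VERSION);
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */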
1410 
1411 static long vfio_group_fops_unl_ioctl(struct file *filep,
1412 				      unsigned int cmd, unsigned long arg)
1413 {
1414 	struct vfio_group *group = filep->private_data;
1415 	long ret = -ENOTTY;
1416 
1417 	switch (cmd) {
1418 	case VFIO_GROUP_GET_STATUS:
1419 	{
1420 		struct vfio_group_status status;
1421 		unsigned long minsz;
1422 
1423 		minsz = offsetofend(struct vfio_group_status, flags);
1424 
1425 		if (copy_from_user(&status, (void __user *)arg, minsz))
1426 			return -EFAULT;
1427 
1428 		if (status.argsz < minsz)
1429 			return -EINVAL;
1430 
1431 		status.flags = 0;
1432 
1433 		if (vfio_group_viable(group))
1434 			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1435 
1436 		if (group->container)
1437 			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1438 
1439 		if (copy_to_user((void __user *)arg, &status, minsz))
1440 			return -EFAULT;
1441 
1442 		ret = 0;
1443 		break;
1444 	}
1445 	case VFIO_GROUP_SET_CONTAINER:
1446 	{
1447 		int fd;
1448 
1449 		if (get_user(fd, (int __user *)arg))
1450 			return -EFAULT;
1451 
1452 		if (fd < 0)
1453 			return -EINVAL;
1454 
1455 		ret = vfio_group_set_container(group, fd);
1456 		break;
1457 	}
1458 	case VFIO_GROUP_UNSET_CONTAINER:
1459 		ret = vfio_group_unset_container(group);
1460 		break;
1461 	case VFIO_GROUP_GET_DEVICE_FD:
1462 	{
1463 		char *buf;
1464 
1465 		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1466 		if (IS_ERR(buf))
1467 			return PTR_ERR(buf);
1468 
1469 		ret = vfio_group_get_device_fd(group, buf);
1470 		kfree(buf);
1471 		break;
1472 	}
1473 	}
1474 
1475 	return ret;
1476 }
1477 
1478 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1479 {
1480 	struct vfio_group *group =
1481 		container_of(inode->i_cdev, struct vfio_group, cdev);
1482 	int opened;
1483 
1484 	/* users can be zero if this races with vfio_group_put() */
1485 	if (!refcount_inc_not_zero(&group->users))
1486 		return -ENODEV;
1487 
1488 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1489 		vfio_group_put(group);
1490 		return -EPERM;
1491 	}
1492 
1493 	/* Do we need multiple instances of the group open?  Seems not. */
1494 	opened = atomic_cmpxchg(&group->opened, 0, 1);
1495 	if (opened) {
1496 		vfio_group_put(group);
1497 		return -EBUSY;
1498 	}
1499 
1500 	/* Is something still in use from a previous open? */
1501 	if (group->container) {
1502 		atomic_dec(&group->opened);
1503 		vfio_group_put(group);
1504 		return -EBUSY;
1505 	}
1506 
1507 	/* Warn if previous user didn't cleanup and re-init to drop them */
1508 	if (WARN_ON(group->notifier.head))
1509 		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1510 
1511 	filep->private_data = group;
1512 
1513 	return 0;
1514 }
1515 
1516 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1517 {
1518 	struct vfio_group *group = filep->private_data;
1519 
1520 	filep->private_data = NULL;
1521 
1522 	vfio_group_try_dissolve_container(group);
1523 
1524 	atomic_dec(&group->opened);
1525 
1526 	vfio_group_put(group);
1527 
1528 	return 0;
1529 }
1530 
1531 static const struct file_operations vfio_group_fops = {
1532 	.owner		= THIS_MODULE,
1533 	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1534 	.compat_ioctl	= compat_ptr_ioctl,
1535 	.open		= vfio_group_fops_open,
1536 	.release	= vfio_group_fops_release,
1537 };
1538 
1539 /*
1540  * VFIO Device fd
1541  */
1542 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1543 {
1544 	struct vfio_device *device = filep->private_data;
1545 
1546 	mutex_lock(&device->dev_set->lock);
1547 	if (!--device->open_count && device->ops->close_device)
1548 		device->ops->close_device(device);
1549 	mutex_unlock(&device->dev_set->lock);
1550 
1551 	module_put(device->dev->driver->owner);
1552 
1553 	vfio_group_try_dissolve_container(device->group);
1554 
1555 	vfio_device_put(device);
1556 
1557 	return 0;
1558 }
1559 
1560 static long vfio_device_fops_unl_ioctl(struct file *filep,
1561 				       unsigned int cmd, unsigned long arg)
1562 {
1563 	struct vfio_device *device = filep->private_data;
1564 
1565 	if (unlikely(!device->ops->ioctl))
1566 		return -EINVAL;
1567 
1568 	return device->ops->ioctl(device, cmd, arg);
1569 }
1570 
1571 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1572 				     size_t count, loff_t *ppos)
1573 {
1574 	struct vfio_device *device = filep->private_data;
1575 
1576 	if (unlikely(!device->ops->read))
1577 		return -EINVAL;
1578 
1579 	return device->ops->read(device, buf, count, ppos);
1580 }
1581 
1582 static ssize_t vfio_device_fops_write(struct file *filep,
1583 				      const char __user *buf,
1584 				      size_t count, loff_t *ppos)
1585 {
1586 	struct vfio_device *device = filep->private_data;
1587 
1588 	if (unlikely(!device->ops->write))
1589 		return -EINVAL;
1590 
1591 	return device->ops->write(device, buf, count, ppos);
1592 }
1593 
1594 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1595 {
1596 	struct vfio_device *device = filep->private_data;
1597 
1598 	if (unlikely(!device->ops->mmap))
1599 		return -EINVAL;
1600 
1601 	return device->ops->mmap(device, vma);
1602 }
1603 
1604 static const struct file_operations vfio_device_fops = {
1605 	.owner		= THIS_MODULE,
1606 	.release	= vfio_device_fops_release,
1607 	.read		= vfio_device_fops_read,
1608 	.write		= vfio_device_fops_write,
1609 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1610 	.compat_ioctl	= compat_ptr_ioctl,
1611 	.mmap		= vfio_device_fops_mmap,
1612 };
1613 
1614 /*
1615  * External user API, exported by symbols to be linked dynamically.
1616  *
1617  * The protocol includes:
1618  *  1. do normal VFIO init operation:
1619  *	- opening a new container;
1620  *	- attaching group(s) to it;
1621  *	- setting an IOMMU driver for a container.
1622  * When IOMMU is set for a container, all groups in it are
1623  * considered ready to use by an external user.
1624  *
1625  * 2. User space passes a group fd to an external user.
1626  * The external user calls vfio_group_get_external_user()
1627  * to verify that:
1628  *	- the group is initialized;
1629  *	- IOMMU is set for it.
1630  * If both checks passed, vfio_group_get_external_user()
1631  * increments the container user counter to prevent
1632  * the VFIO group from disposal before KVM exits.
1633  *
1634  * 3. The external user calls vfio_external_user_iommu_id()
1635  * to know an IOMMU ID.
1636  *
1637  * 4. When the external KVM finishes, it calls
1638  * vfio_group_put_external_user() to release the VFIO group.
1639  * This call decrements the container user counter.
1640  */
1641 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1642 {
1643 	struct vfio_group *group = filep->private_data;
1644 	int ret;
1645 
1646 	if (filep->f_op != &vfio_group_fops)
1647 		return ERR_PTR(-EINVAL);
1648 
1649 	ret = vfio_group_add_container_user(group);
1650 	if (ret)
1651 		return ERR_PTR(ret);
1652 
1653 	/*
1654 	 * Since the caller holds the fget on the file group->users must be >= 1
1655 	 */
1656 	vfio_group_get(group);
1657 
1658 	return group;
1659 }
1660 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
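/*
 * Illustrative in-kernel usage (sketch), mirroring how a consumer such as
 * KVM takes an external reference from a group fd handed over by userspace:
 *
 *	struct fd f = fdget(group_fd);
 *	struct vfio_group *grp = vfio_group_get_external_user(f.file);
 *
 *	fdput(f);
 *	if (IS_ERR(grp))
 *		return PTR_ERR(grp);
 *	... use vfio_external_user_iommu_id(grp), etc. ...
 *	vfio_group_put_external_user(grp);
 */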
1661 
1662 /*
1663  * External user API, exported by symbols to be linked dynamically.
1664  * The external user passes in a device pointer
1665  * to verify that:
1666  *	- A VFIO group is associated with the device;
1667  *	- IOMMU is set for the group.
1668  * If both checks passed, vfio_group_get_external_user_from_dev()
1669  * increments the container user counter to prevent the VFIO group
1670  * from disposal before external user exits and returns the pointer
1671  * to the VFIO group.
1672  *
1673  * When the external user finishes using the VFIO group, it calls
1674  * vfio_group_put_external_user() to release the VFIO group and
1675  * decrement the container user counter.
1676  *
1677  * @dev [in]	: device
1678  * Return error PTR or pointer to VFIO group.
1679  */
1680 
1681 struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1682 {
1683 	struct vfio_group *group;
1684 	int ret;
1685 
1686 	group = vfio_group_get_from_dev(dev);
1687 	if (!group)
1688 		return ERR_PTR(-ENODEV);
1689 
1690 	ret = vfio_group_add_container_user(group);
1691 	if (ret) {
1692 		vfio_group_put(group);
1693 		return ERR_PTR(ret);
1694 	}
1695 
1696 	return group;
1697 }
1698 EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1699 
1700 void vfio_group_put_external_user(struct vfio_group *group)
1701 {
1702 	vfio_group_try_dissolve_container(group);
1703 	vfio_group_put(group);
1704 }
1705 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1706 
1707 bool vfio_external_group_match_file(struct vfio_group *test_group,
1708 				    struct file *filep)
1709 {
1710 	struct vfio_group *group = filep->private_data;
1711 
1712 	return (filep->f_op == &vfio_group_fops) && (group == test_group);
1713 }
1714 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1715 
1716 int vfio_external_user_iommu_id(struct vfio_group *group)
1717 {
1718 	return iommu_group_id(group->iommu_group);
1719 }
1720 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1721 
1722 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1723 {
1724 	return vfio_ioctl_check_extension(group->container, arg);
1725 }
1726 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1727 
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities: allocate or
 * reallocate a buffer with additional @size, filling in the @id and
 * @version of the capability.  A pointer to the new capability is
 * returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up
 * the next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
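/*
 * Illustrative sketch, not part of this driver: a bus driver's
 * *_GET_INFO ioctl path typically builds a chain with vfio_info_cap_add()
 * (or vfio_info_add_capability() below), then shifts the chain past the
 * fixed info struct before copying out.  "info", "arg", "size", "cap_id"
 * and "cap_version" are assumptions for the sketch:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *header;
 *
 *	header = vfio_info_cap_add(&caps, size, cap_id, cap_version);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	...fill in the capability payload that follows *header...
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user((void __user *)arg + sizeof(info),
 *				 caps.buf, caps.size))
 *			ret = -EFAULT;
 *		kfree(caps.buf);
 *	}
 */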

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
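/*
 * Illustrative sketch, not part of this driver: a hypothetical device
 * driver's VFIO_DEVICE_SET_IRQS handler usually validates the header and
 * sizes the trailing data with the helper above before copying it in.
 * "arg", NUM_IRQS and NUM_IRQ_TYPES are assumptions for the example:
 *
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, NUM_IRQS,
 *						 NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 *	...program the interrupts described by hdr and data, kfree(data)...
 */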

/*
 * Pin a set of guest PFNs and return their associated host PFNs for the
 * local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	if (group->dev_counter > 1) {
		ret = -EINVAL;
		goto err_pin_pages;
	}

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin a set of host PFNs for the local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of
 *		   user/guest PFNs should not be greater than
 *		   VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
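/*
 * Illustrative sketch, not part of this driver: a hypothetical mediated
 * device vendor driver pinning a single guest PFN for DMA and unpinning
 * it when the transfer is done.  "dev" and "gfn" are assumptions for the
 * example:
 *
 *	unsigned long user_pfn = gfn, phys_pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(dev, &user_pfn, 1, IOMMU_READ | IOMMU_WRITE,
 *			     &phys_pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	...program the device with the host address PFN_PHYS(phys_pfn)...
 *
 *	vfio_unpin_pages(dev, &user_pfn, 1);
 */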

/*
 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
 * VFIO group.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from being disposed of in the middle of
 * the call.  The caller may keep the reference to the VFIO group across
 * several calls into this interface.
 * After finishing using the VFIO group, the caller needs to release the
 * VFIO group by calling vfio_group_put_external_user().
 *
 * @group [in]		: VFIO group
 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be pinned.
 * @npage [in]		: count of elements in user_iova_pfn array.
 *			  This count should not be greater than
 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]		: protection flags
 * @phys_pfn [out]	: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_group_pin_pages(struct vfio_group *group,
			 unsigned long *user_iova_pfn, int npage,
			 int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!group || !user_iova_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (group->dev_counter > 1)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_iova_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_group_pin_pages);

/*
 * Unpin a set of guest IOVA PFNs for a VFIO group.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from being disposed of in the middle of
 * the call.  The caller may keep the reference to the VFIO group across
 * several calls into this interface.
 * After finishing using the VFIO group, the caller needs to release the
 * VFIO group by calling vfio_group_put_external_user().
 *
 * @group [in]		: VFIO group
 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be unpinned.
 * @npage [in]		: count of elements in user_iova_pfn array.
 *			  This count should not be greater than
 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_group_unpin_pages(struct vfio_group *group,
			   unsigned long *user_iova_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!group || !user_iova_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data,
					       user_iova_pfn, npage);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_group_unpin_pages);
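/*
 * Illustrative sketch, not part of this driver: an external user that
 * already holds a group reference (see vfio_group_get_external_user())
 * can pin and unpin by IOVA PFN directly on that group.  "group" and
 * "iova" are assumptions for the example:
 *
 *	unsigned long iova_pfn = iova >> PAGE_SHIFT, phys_pfn;
 *	int ret;
 *
 *	ret = vfio_group_pin_pages(group, &iova_pfn, 1,
 *				   IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	...
 *	vfio_group_unpin_pages(group, &iova_pfn, 1);
 */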

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * The CPUs read from or write to a range of IOVAs pointing to user space
 * memory, copying into or out of a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from being disposed of in the middle of
 * the call.  The caller may keep the reference to the VFIO group across
 * several calls into this interface.
 * After finishing using the VFIO group, the caller needs to release the
 * VFIO group by calling vfio_group_put_external_user().
 *
 * @group [in]		: VFIO group
 * @user_iova [in]	: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicates write (true) or read (false)
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
		void *data, size_t len, bool write)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (!group || !data || len <= 0)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;

	if (likely(driver && driver->ops->dma_rw))
		ret = driver->ops->dma_rw(container->iommu_data,
					  user_iova, data, len, write);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);
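/*
 * Illustrative sketch, not part of this driver: reading a small guest
 * structure at a known IOVA into a kernel buffer through the group's
 * IOMMU mappings, without pinning, then writing it back.  "group",
 * "iova" and struct example_desc are assumptions for the sketch:
 *
 *	struct example_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(group, iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *	...update desc...
 *	ret = vfio_dma_rw(group, iova, &desc, sizeof(desc), true);
 */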

static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);

static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if there are still unknown events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The kvm may already be attached to the vfio_group, so replay
	 * the event once here upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					 struct notifier_block *nb)
{
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	vfio_group_try_dissolve_container(group);

	return ret;
}

int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
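/*
 * Illustrative sketch, not part of this driver: a hypothetical vendor
 * driver tracking the KVM association of its device's group.  "vdev",
 * "dev" and example_kvm_notify() are assumptions for the example:
 *
 *	static int example_kvm_notify(struct notifier_block *nb,
 *				      unsigned long action, void *data)
 *	{
 *		if (action == VFIO_GROUP_NOTIFY_SET_KVM)
 *			...record or clear the struct kvm * passed in data...
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
 *
 *	vdev->nb.notifier_call = example_kvm_notify;
 *	ret = vfio_register_notifier(dev, VFIO_GROUP_NOTIFY, &events,
 *				     &vdev->nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_GROUP_NOTIFY, &vdev->nb);
 */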

struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;

	if (!group)
		return ERR_PTR(-EINVAL);

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->group_iommu_domain))
		return driver->ops->group_iommu_domain(container->iommu_data,
						       group->iommu_group);

	return ERR_PTR(-ENOTTY);
}
EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);

/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.group_ida);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	ida_destroy(&vfio.group_ida);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");