Thu Oct 27 00:53:32 UTC 2011


        Fingers crossed, this is the last RFC for VFIO, but we need
        the iommu group support before this can go upstream
        (http://lkml.indiana.edu/hypermail/linux/kernel/1110.2/02303.html),
        hoping this helps push that along.

That's the one bit keeping me from doing a non-RFC of the core, besides
fixing all these comments ;)

> > +		return NULL;
> > +
> > +	list_for_each(gpos, &vfio.group_list) {
> > +		struct vfio_group *group;
> > +		struct list_head *dpos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, group_next);
> > +
> > +		if (group->groupid != groupid)
> > +			continue;
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (device->dev == dev)
> > +				return device;
> > +		}
> > +	}
> > +	return NULL;
> > +}
> > +
> > +/* All release paths simply decrement the refcnt, attempt to teardown
> > + * the iommu and merged groups, and wakeup anything that might be
> > + * waiting if we successfully dissolve anything. */
> > +static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu)
> > +{
> > +	bool wake;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	(*refcnt)--;
> > +	wake = (__vfio_try_dissolve_iommu(iommu) == 0);
> > +
> > +	mutex_unlock(&vfio.lock);
> > +
> > +	if (wake)
> > +		wake_up(&vfio.release_q);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Device fops - passthrough to vfio device driver w/ device_data
> > + */
> > +static int vfio_device_release(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	vfio_do_release(&device->refcnt, device->iommu);
> > +
> > +	device->ops->put(device->device_data);
> > +
> > +	return 0;
> > +}
> > +
> > +static long vfio_device_unl_ioctl(struct file *filep,
> > +				  unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->ioctl(device->device_data, cmd, arg);
> > +}
> > +
> > +static ssize_t vfio_device_read(struct file *filep, char __user *buf,
> > +				size_t count, loff_t *ppos)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->read(device->device_data, buf, count, ppos);
> > +}
> > +
> > +static ssize_t vfio_device_write(struct file *filep, const char __user *buf,
> > +				 size_t count, loff_t *ppos)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->write(device->device_data, buf, count, ppos);
> > +}
> > +
> > +static int vfio_device_mmap(struct file *filep, struct vm_area_struct *vma)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->mmap(device->device_data, vma);
> > +}
> > +	
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_device_compat_ioctl(struct file *filep,
> > +				     unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_device_unl_ioctl(filep, cmd, arg);
> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +const struct file_operations vfio_device_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.release	= vfio_device_release,
> > +	.read		= vfio_device_read,
> > +	.write		= vfio_device_write,
> > +	.unlocked_ioctl	= vfio_device_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_device_compat_ioctl,
> > +#endif
> > +	.mmap		= vfio_device_mmap,
> > +};
> > +
> > +/*
> > + * Group fops
> > + */
> > +static int vfio_group_open(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_group *group;
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	group = idr_find(&vfio.idr, iminor(inode));
> > +
> > +	if (!group) {
> > +		ret = -ENODEV;
> > +		goto out;
> > +	}
> > +
> > +	filep->private_data = group;
> > +
> > +	if (!group->iommu) {
> > +		struct vfio_iommu *iommu;
> > +
> > +		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> > +		if (!iommu) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +		INIT_LIST_HEAD(&iommu->group_list);
> > +		INIT_LIST_HEAD(&iommu->dm_list);
> > +		mutex_init(&iommu->dgate);
> > +		iommu->bus = group->bus;
> > +		__vfio_group_set_iommu(group, iommu);
> > +	}
> > +	group->refcnt++;
> > +
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +
> > +	return ret;
> > +}
> > +
> > +static int vfio_group_release(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_group *group = filep->private_data;
> > +
> > +	return vfio_do_release(&group->refcnt, group->iommu);
> > +}
> > +
> > +/* Attempt to merge the group pointed to by fd into group.  The merge-ee
> > + * group must not have an iommu or any devices open because we cannot
> > + * maintain that context across the merge.  The merge-er group can be
> > + * in use. */
> > +static int vfio_group_merge(struct vfio_group *group, int fd)
> > +{
> > +	struct vfio_group *new;
> > +	struct vfio_iommu *old_iommu;
> > +	struct file *file;
> > +	int ret = 0;
> > +	bool opened = false;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +
> > +	/* Sanity check, is this really our fd? */
> > +	if (file->f_op != &vfio_group_fops) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	new = file->private_data;
> > +
> > +	if (!new || new == group || !new->iommu ||
> > +	    new->iommu->domain || new->bus != group->bus) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* We need to attach all the devices to each domain separately
> > +	 * in order to validate that the capabilities match for both.  */
> > +	ret = __vfio_open_iommu(new->iommu);
> > +	if (ret)
> > +		goto out;
> > +
> > +	if (!group->iommu->domain) {
> > +		ret = __vfio_open_iommu(group->iommu);
> > +		if (ret)
> > +			goto out;
> > +		opened = true;
> > +	}
> > +
> > +	/* If cache coherency doesn't match we'd potentially need to
> > +	 * remap existing iommu mappings in the merge-er domain.
> > +	 * Not worth the effort to allow this currently. */
> > +	if (iommu_domain_has_cap(group->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY) !=
> > +	    iommu_domain_has_cap(new->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY)) {
> > +		__vfio_close_iommu(new->iommu);
> > +		if (opened)
> > +			__vfio_close_iommu(group->iommu);
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* Close the iommu for the merge-ee and attach all its devices
> > +	 * to the merge-er iommu. */
> > +	__vfio_close_iommu(new->iommu);
> > +
> > +	ret = __vfio_iommu_attach_group(group->iommu, new);
> > +	if (ret)
> > +		goto out;
> > +
> > +	/* set_iommu unlinks new from the iommu, so save a pointer to it */
> > +	old_iommu = new->iommu;
> > +	__vfio_group_set_iommu(new, group->iommu);
> > +	kfree(old_iommu);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
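
For reference, a rough sketch of how userspace might drive this merge
path from the other side (the group ids below are made up; the group
chardevs appear as /dev/vfio/<groupid> per the devnode hook later in
the patch, and linux/vfio.h is the header this patch adds):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	int main(void)
	{
		int fd1 = open("/dev/vfio/26", O_RDWR);
		int fd2 = open("/dev/vfio/27", O_RDWR);

		/* merge group 27 into 26; the merge-ee (27) must have no
		 * open iommu or device fds and must share a bus with 26 */
		if (ioctl(fd1, VFIO_GROUP_MERGE, &fd2))
			perror("VFIO_GROUP_MERGE");

		/* later, split it back out into its own iommu context */
		if (ioctl(fd1, VFIO_GROUP_UNMERGE, &fd2))
			perror("VFIO_GROUP_UNMERGE");

		return 0;
	}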
> > +
> > +/* Unmerge the group pointed to by fd from group. */
> > +static int vfio_group_unmerge(struct vfio_group *group, int fd)
> > +{
> > +	struct vfio_group *new;
> > +	struct vfio_iommu *new_iommu;
> > +	struct file *file;
> > +	int ret = 0;
> > +
> > +	/* Since the merge-out group is already opened, it needs to
> > +	 * have an iommu struct associated with it. */
> > +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);
> > +	if (!new_iommu)
> > +		return -ENOMEM;
> > +
> > +	INIT_LIST_HEAD(&new_iommu->group_list);
> > +	INIT_LIST_HEAD(&new_iommu->dm_list);
> > +	mutex_init(&new_iommu->dgate);
> > +	new_iommu->bus = group->bus;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +
> > +	/* Sanity check, is this really our fd? */
> > +	if (file->f_op != &vfio_group_fops) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	new = file->private_data;
> > +	if (!new || new == group || new->iommu != group->iommu) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* We can't merge-out a group with devices still in use. */
> > +	if (__vfio_group_devs_inuse(new)) {
> > +		ret = -EBUSY;
> > +		goto out;
> > +	}
> > +
> > +	__vfio_iommu_detach_group(group->iommu, new);
> > +	__vfio_group_set_iommu(new, new_iommu);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	if (ret)
> > +		kfree(new_iommu);
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Get a new iommu file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set. */
> > +static int vfio_group_get_iommu_fd(struct vfio_group *group)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (!group->iommu->domain) {
> > +		ret = __vfio_open_iommu(group->iommu);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,
> > +			       group->iommu, O_RDWR);
> > +	if (ret < 0)
> > +		goto out;
> > +
> > +	group->iommu->refcnt++;
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Get a new device file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set.  It's difficult to
> > + * specify the requirements for matching a user supplied buffer to a
> > + * device, so we use a vfio driver callback to test for a match.  For
> > + * PCI, dev_name(dev) is unique, but other drivers may require including
> > + * a parent device string. */
> > +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
> > +{
> > +	struct vfio_iommu *iommu = group->iommu;
> > +	struct list_head *gpos;
> > +	int ret = -ENODEV;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (!iommu->domain) {
> > +		ret = __vfio_open_iommu(iommu);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	list_for_each(gpos, &iommu->group_list) {
> > +		struct list_head *dpos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, iommu_next);
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (device->ops->match(device->dev, buf)) {
> > +				struct file *file;
> > +
> > +				if (device->ops->get(device->device_data)) {
> > +					ret = -EFAULT;
> > +					goto out;
> > +				}
> > +
> > +				/* We can't use anon_inode_getfd() like
> > +				 * above because we need to modify the
> > +				 * f_mode flags directly to allow more
> > +				 * than just ioctls */
> > +				ret = get_unused_fd();
> > +				if (ret < 0) {
> > +					device->ops->put(device->device_data);
> > +					goto out;
> > +				}
> > +
> > +				file = anon_inode_getfile("[vfio-device]",
> > +							  &vfio_device_fops,
> > +							  device, O_RDWR);
> > +				if (IS_ERR(file)) {
> > +					put_unused_fd(ret);
> > +					ret = PTR_ERR(file);
> > +					device->ops->put(device->device_data);
> > +					goto out;
> > +				}
> > +
> > +				/* Todo: add an anon_inode interface to do
> > +				 * this.  Appears to be missing by lack of
> > +				 * need rather than explicitly prevented.
> > +				 * Now there's need. */
> > +				file->f_mode |= (FMODE_LSEEK |
> > +						 FMODE_PREAD |
> > +						 FMODE_PWRITE);
> > +
> > +				fd_install(ret, file);
> > +
> > +				device->refcnt++;
> > +				goto out;
> > +			}
> > +		}
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +static long vfio_group_unl_ioctl(struct file *filep,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_group *group = filep->private_data;
> > +
> > +	if (cmd == VFIO_GROUP_GET_FLAGS) {
> > +		u64 flags = 0;
> > +
> > +		mutex_lock(&vfio.lock);
> > +		if (__vfio_iommu_viable(group->iommu))
> > +			flags |= VFIO_GROUP_FLAGS_VIABLE;
> > +		mutex_unlock(&vfio.lock);
> > +
> > +		if (group->iommu->mm)
> > +			flags |= VFIO_GROUP_FLAGS_MM_LOCKED;
> > +
> > +		return put_user(flags, (u64 __user *)arg);
> > +	}
> > +		
> > +	/* Below commands are restricted once the mm is set */
> > +	if (group->iommu->mm && group->iommu->mm != current->mm)
> > +		return -EPERM;
> > +
> > +	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {
> > +		int fd;
> > +		
> > +		if (get_user(fd, (int __user *)arg))
> > +			return -EFAULT;
> > +		if (fd < 0)
> > +			return -EINVAL;
> > +
> > +		if (cmd == VFIO_GROUP_MERGE)
> > +			return vfio_group_merge(group, fd);
> > +		else
> > +			return vfio_group_unmerge(group, fd);
> > +	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {
> > +		return vfio_group_get_iommu_fd(group);
> > +	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {
> > +		char *buf;
> > +		int ret;
> > +
> > +		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
> > +		if (IS_ERR(buf))
> > +			return PTR_ERR(buf);
> > +
> > +		ret = vfio_group_get_device_fd(group, buf);
> > +		kfree(buf);
> > +		return ret;
> > +	}
> > +
> > +	return -ENOSYS;
> > +}
> > +
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_group_compat_ioctl(struct file *filep,
> > +				    unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_group_unl_ioctl(filep, cmd, arg);
> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +static const struct file_operations vfio_group_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.open		= vfio_group_open,
> > +	.release	= vfio_group_release,
> > +	.unlocked_ioctl	= vfio_group_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_group_compat_ioctl,
> > +#endif
> > +};
> > +
> > +/* iommu fd release hook */
> > +int vfio_release_iommu(struct vfio_iommu *iommu)
> > +{
> > +	return vfio_do_release(&iommu->refcnt, iommu);
> > +}
> > +
> > +/*
> > + * VFIO driver API
> > + */
> > +
> > +/* Add a new device to the vfio framework with associated vfio driver
> > + * callbacks.  This is the entry point for vfio drivers to register devices. */
> > +int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_group *group = NULL;
> > +	struct vfio_device *device = NULL;
> > +	unsigned int groupid;
> > +	int ret = 0;
> > +	bool new_group = false;
> > +
> > +	if (!ops)
> > +		return -EINVAL;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> > +		return -ENODEV;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		group = list_entry(pos, struct vfio_group, group_next);
> > +		if (group->groupid == groupid)
> > +			break;
> > +		group = NULL;
> > +	}
> > +
> > +	if (!group) {
> > +		int minor;
> > +
> > +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		group = kzalloc(sizeof(*group), GFP_KERNEL);
> > +		if (!group) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		group->groupid = groupid;
> > +		INIT_LIST_HEAD(&group->device_list);
> > +
> > +		ret = idr_get_new(&vfio.idr, group, &minor);
> > +		if (ret == 0 && minor > MINORMASK) {
> > +			idr_remove(&vfio.idr, minor);
> > +			kfree(group);
> > +			ret = -ENOSPC;
> > +			goto out;
> > +		}
> > +
> > +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> > +		device_create(vfio.class, NULL, group->devt,
> > +			      group, "%u", groupid);
> > +
> > +		group->bus = dev->bus;
> 
> 
> Oh, so that is how the IOMMU iommu_ops get copied! You might
> want to mention that - I was not sure where the 'handoff' was
> done to insert a device so that it can do iommu_ops properly.
> 
> Ok, so whether a device can do IOMMU is detected when we try to
> open it - that is when iommu_domain_alloc is called, which can
> return NULL if iommu_ops is not set.
> 
> So what about devices that don't have an iommu_ops? Say they
> are using SWIOTLB? (like the AMD-Vi sometimes does if the
> device is not on its list).
> 
> Can we use iommu_present?

I'm not sure I'm following your revelation ;)  Take a look at the
pointer to iommu_device_group I pasted above, or these:

https://github.com/awilliam/linux-vfio/commit/37dd08c90d149caaed7779d4f38850a8f7ed0fa5
https://github.com/awilliam/linux-vfio/commit/63ca8543533d8130db23d7949133e548c3891c97
https://github.com/awilliam/linux-vfio/commit/8d7d70eb8e714fbf8710848a06f8cab0c741631e

That call includes an iommu_present() check, so if there's no iommu or
the iommu can't provide a groupid, the device is skipped by vfio (it
can't be used).

So the ordering is:

 - bus driver registers device
   - if it has an iommu group, add it to the vfio device/group tracking

 - group gets opened
   - user gets iommu or device fd results in iommu_domain_alloc

Devices without iommu_ops don't get to play in the vfio world.
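
In userspace terms, that ordering might look roughly like this (the
group id and device name are examples, error handling mostly elided):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	int main(void)
	{
		int group, iommu, device;
		__u64 flags;

		/* chardev created as /dev/vfio/<groupid> at registration */
		group = open("/dev/vfio/26", O_RDWR);

		ioctl(group, VFIO_GROUP_GET_FLAGS, &flags);
		if (!(flags & VFIO_GROUP_FLAGS_VIABLE)) {
			fprintf(stderr, "bind all group devices to vfio first\n");
			return 1;
		}

		/* either of these opens the iommu (iommu_domain_alloc)
		 * and locks the group to current->mm */
		iommu = ioctl(group, VFIO_GROUP_GET_IOMMU_FD);
		device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");

		return 0;
	}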

> > +		list_add(&group->group_next, &vfio.group_list);
> > +		new_group = true;
> > +	} else {
> > +		if (group->bus != dev->bus) {
> > +			printk(KERN_WARNING
> > +			       "Error: IOMMU group ID conflict.  Group ID %u "
> > +				"on both bus %s and %s\n", groupid,
> > +				group->bus->name, dev->bus->name);
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +
> > +		list_for_each(pos, &group->device_list) {
> > +			device = list_entry(pos,
> > +					    struct vfio_device, device_next);
> > +			if (device->dev == dev)
> > +				break;
> > +			device = NULL;
> > +		}
> > +	}
> > +
> > +	if (!device) {
> > +		if (__vfio_group_devs_inuse(group) ||
> > +		    (group->iommu && group->iommu->refcnt)) {
> > +			printk(KERN_WARNING
> > +			       "Adding device %s to group %u while group is already in use!!\n",
> > +			       dev_name(dev), group->groupid);
> > +			/* XXX How to prevent other drivers from claiming? */
> > +		}
> > +
> > +		device = kzalloc(sizeof(*device), GFP_KERNEL);
> > +		if (!device) {
> > +			/* If we just created this group, tear it down */
> > +			if (new_group) {
> > +				list_del(&group->group_next);
> > +				device_destroy(vfio.class, group->devt);
> > +				idr_remove(&vfio.idr, MINOR(group->devt));
> > +				kfree(group);
> > +			}
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		list_add(&device->device_next, &group->device_list);
> > +		device->dev = dev;
> > +		device->ops = ops;
> > +		device->iommu = group->iommu; /* NULL if new */
> > +		__vfio_iommu_attach_dev(group->iommu, device);
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_group_add_dev);
> > +
> > +/* Remove a device from the vfio framework */
> > +void vfio_group_del_dev(struct device *dev)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_group *group = NULL;
> > +	struct vfio_device *device = NULL;
> > +	unsigned int groupid;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> > +		return;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		group = list_entry(pos, struct vfio_group, group_next);
> > +		if (group->groupid == groupid)
> > +			break;
> > +		group = NULL;
> > +	}
> > +
> > +	if (!group)
> > +		goto out;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		if (device->dev == dev)
> > +			break;
> > +		device = NULL;
> > +	}
> > +
> > +	if (!device)
> > +		goto out;
> > +
> > +	BUG_ON(device->refcnt);
> > +
> > +	if (device->attached)
> > +		__vfio_iommu_detach_dev(group->iommu, device);
> > +
> > +	list_del(&device->device_next);
> > +	kfree(device);
> > +
> > +	/* If this was the only device in the group, remove the group.
> > +	 * Note that we intentionally unmerge empty groups here if the
> > +	 * group fd isn't opened. */
> > +	if (list_empty(&group->device_list) && group->refcnt == 0) {
> > +		struct vfio_iommu *iommu = group->iommu;
> > +
> > +		if (iommu) {
> > +			__vfio_group_set_iommu(group, NULL);
> > +			__vfio_try_dissolve_iommu(iommu);
> > +		}
> > +
> > +		device_destroy(vfio.class, group->devt);
> > +		idr_remove(&vfio.idr, MINOR(group->devt));
> > +		list_del(&group->group_next);
> > +		kfree(group);
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_group_del_dev);
> > +
> > +/* When a device is bound to a vfio device driver (ex. vfio-pci), this
> > + * entry point is used to mark the device usable (viable).  The vfio
> > + * device driver associates a private device_data struct with the device
> > + * here, which will later be passed to the vfio_device_fops callbacks. */
> > +int vfio_bind_dev(struct device *dev, void *device_data)
> > +{
> > +	struct vfio_device *device;
> > +	int ret = -EINVAL;
> > +
> > +	BUG_ON(!device_data);
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	device = __vfio_lookup_dev(dev);
> > +
> > +	BUG_ON(!device);
> > +
> > +	ret = dev_set_drvdata(dev, device);
> > +	if (!ret)
> > +		device->device_data = device_data;
> > +
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_bind_dev);
> > +
> > +/* A device is only removeable if the iommu for the group is not in use. */
> > +static bool vfio_device_removeable(struct vfio_device *device)
> > +{
> > +	bool ret = true;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (device->iommu && __vfio_iommu_inuse(device->iommu))
> > +		ret = false;
> > +
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Notify vfio that a device is being unbound from the vfio device driver
> > + * and return the device private device_data pointer.  If the group is
> > + * in use, we need to block or take other measures to make it safe for
> > + * the device to be removed from the iommu. */
> > +void *vfio_unbind_dev(struct device *dev)
> > +{
> > +	struct vfio_device *device = dev_get_drvdata(dev);
> > +	void *device_data;
> > +
> > +	BUG_ON(!device);
> > +
> > +again:
> > +	if (!vfio_device_removeable(device)) {
> > +		/* XXX signal for all devices in group to be removed or
> > +		 * resort to killing the process holding the device fds.
> > +		 * For now just block waiting for releases to wake us. */
> > +		wait_event(vfio.release_q, vfio_device_removeable(device));
> > +	}
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	/* Need to re-check that the device is still removeable under lock. */
> > +	if (device->iommu && __vfio_iommu_inuse(device->iommu)) {
> > +		mutex_unlock(&vfio.lock);
> > +		goto again;
> > +	}
> > +
> > +	device_data = device->device_data;
> > +
> > +	device->device_data = NULL;
> > +	dev_set_drvdata(dev, NULL);
> > +
> > +	mutex_unlock(&vfio.lock);
> > +	return device_data;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_unbind_dev);
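
As a sanity check on the driver API above, here's a rough sketch of how
a hypothetical vfio bus driver might use these entry points (the foo_*
names are made up; the vfio_device_ops callbacks are assumed to be
implemented elsewhere):

	#include <linux/device.h>
	#include <linux/slab.h>
	#include <linux/vfio.h>

	/* hypothetical per-device state for the bus driver */
	struct foo_vdev {
		struct device *dev;
	};

	static const struct vfio_device_ops foo_ops = {
		/* .match/.get/.put/.read/.write/.ioctl/.mmap elsewhere */
	};

	static int foo_probe(struct device *dev)
	{
		struct foo_vdev *vdev;
		int ret;

		/* the device was already placed in its group when the
		 * bus registered it via vfio_group_add_dev(dev, &foo_ops) */
		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
		if (!vdev)
			return -ENOMEM;
		vdev->dev = dev;

		/* marks the device usable (viable) within its group */
		ret = vfio_bind_dev(dev, vdev);
		if (ret)
			kfree(vdev);
		return ret;
	}

	static int foo_remove(struct device *dev)
	{
		/* may block until the group is no longer in use */
		struct foo_vdev *vdev = vfio_unbind_dev(dev);

		kfree(vdev);
		return 0;
	}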
> > +
> > +/*
> > + * Module/class support
> > + */
> > +static void vfio_class_release(struct kref *kref)
> > +{
> > +	class_destroy(vfio.class);
> > +	vfio.class = NULL;
> > +}
> > +
> > +static char *vfio_devnode(struct device *dev, mode_t *mode)
> > +{
> > +	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
> > +}
> > +
> > +static int __init vfio_init(void)
> > +{
> > +	int ret;
> > +
> > +	idr_init(&vfio.idr);
> > +	mutex_init(&vfio.lock);
> > +	INIT_LIST_HEAD(&vfio.group_list);
> > +	init_waitqueue_head(&vfio.release_q);
> > +
> > +	kref_init(&vfio.kref);
> > +	vfio.class = class_create(THIS_MODULE, "vfio");
> > +	if (IS_ERR(vfio.class)) {
> > +		ret = PTR_ERR(vfio.class);
> > +		goto err_class;
> > +	}
> > +
> > +	vfio.class->devnode = vfio_devnode;
> > +
> > +	/* FIXME - how many minors to allocate... all of them! */
> > +	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
> > +	if (ret)
> > +		goto err_chrdev;
> > +
> > +	cdev_init(&vfio.cdev, &vfio_group_fops);
> > +	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);
> > +	if (ret)
> > +		goto err_cdev;
> > +
> > +	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
> > +
> > +	return 0;
> > +
> > +err_cdev:
> > +	unregister_chrdev_region(vfio.devt, MINORMASK);
> > +err_chrdev:
> > +	kref_put(&vfio.kref, vfio_class_release);
> > +err_class:
> > +	return ret;
> > +}
> > +
> > +static void __exit vfio_cleanup(void)
> > +{
> > +	struct list_head *gpos, *gppos;
> > +
> > +	list_for_each_safe(gpos, gppos, &vfio.group_list) {
> > +		struct vfio_group *group;
> > +		struct list_head *dpos, *dppos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, group_next);
> > +
> > +		list_for_each_safe(dpos, dppos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +			vfio_group_del_dev(device->dev);
> > +		}
> > +	}
> > +
> > +	idr_destroy(&vfio.idr);
> > +	cdev_del(&vfio.cdev);
> > +	unregister_chrdev_region(vfio.devt, MINORMASK);
> > +	kref_put(&vfio.kref, vfio_class_release);
> > +}
> > +
> > +module_init(vfio_init);
> > +module_exit(vfio_cleanup);
> > +
> > +MODULE_VERSION(DRIVER_VERSION);
> > +MODULE_LICENSE("GPL v2");
> > +MODULE_AUTHOR(DRIVER_AUTHOR);
> > +MODULE_DESCRIPTION(DRIVER_DESC);
> > diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h
> > new file mode 100644
> > index 0000000..350ad67
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_private.h
> > @@ -0,0 +1,34 @@
> > +/*
> > + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> > + *     Author: Alex Williamson <alex.williamson at redhat.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * Derived from original vfio:
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs at cisco.com
> > + */
> > +
> > +#include <linux/list.h>
> > +#include <linux/mutex.h>
> > +
> > +#ifndef VFIO_PRIVATE_H
> > +#define VFIO_PRIVATE_H
> > +
> > +struct vfio_iommu {
> > +	struct iommu_domain		*domain;
> > +	struct bus_type			*bus;
> > +	struct mutex			dgate;
> > +	struct list_head		dm_list;
> > +	struct mm_struct		*mm;
> > +	struct list_head		group_list;
> > +	int				refcnt;
> > +	bool				cache;
> > +};
> > +
> > +extern int vfio_release_iommu(struct vfio_iommu *iommu);
> > +extern void vfio_iommu_unmapall(struct vfio_iommu *iommu);
> > +
> > +#endif /* VFIO_PRIVATE_H */
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > new file mode 100644
> > index 0000000..4269b08
> > --- /dev/null
> > +++ b/include/linux/vfio.h
> > @@ -0,0 +1,155 @@
> > +/*
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs at cisco.com
> > + *
> > + * This program is free software; you may redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; version 2 of the License.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + * Portions derived from drivers/uio/uio.c:
> > + * Copyright(C) 2005, Benedikt Spranger <b.spranger at linutronix.de>
> > + * Copyright(C) 2005, Thomas Gleixner <tglx at linutronix.de>
> > + * Copyright(C) 2006, Hans J. Koch <hjk at linutronix.de>
> > + * Copyright(C) 2006, Greg Kroah-Hartman <greg at kroah.com>
> > + *
> > + * Portions derived from drivers/uio/uio_pci_generic.c:
> > + * Copyright (C) 2009 Red Hat, Inc.
> > + * Author: Michael S. Tsirkin <mst at redhat.com>
> > + */
> > +#include <linux/types.h>
> > +
> > +#ifndef VFIO_H
> > +#define VFIO_H
> > +
> > +#ifdef __KERNEL__
> > +
> > +struct vfio_device_ops {
> > +	bool			(*match)(struct device *, char *);
> > +	int			(*get)(void *);
> > +	void			(*put)(void *);
> > +	ssize_t			(*read)(void *, char __user *,
> > +					size_t, loff_t *);
> > +	ssize_t			(*write)(void *, const char __user *,
> > +					 size_t, loff_t *);
> > +	long			(*ioctl)(void *, unsigned int, unsigned long);
> > +	int			(*mmap)(void *, struct vm_area_struct *);
> > +};
> > +
> > +extern int vfio_group_add_dev(struct device *device,
> > +			      const struct vfio_device_ops *ops);
> > +extern void vfio_group_del_dev(struct device *device);
> > +extern int vfio_bind_dev(struct device *device, void *device_data);
> > +extern void *vfio_unbind_dev(struct device *device);
> > +
> > +#endif /* __KERNEL__ */
> > +
> > +/*
> > + * VFIO driver - allow mapping and use of certain devices
> > + * in unprivileged user processes. (If IOMMU is present)
> > + * Especially useful for Virtual Function parts of SR-IOV devices
> > + */
> > +
> > +
> > +/* Kernel & User level defines for ioctls */
> > +
> > +#define VFIO_GROUP_GET_FLAGS		_IOR(';', 100, __u64)
> > + #define VFIO_GROUP_FLAGS_VIABLE	(1 << 0)
> > + #define VFIO_GROUP_FLAGS_MM_LOCKED	(1 << 1)
> > +#define VFIO_GROUP_MERGE		_IOW(';', 101, int)
> > +#define VFIO_GROUP_UNMERGE		_IOW(';', 102, int)
> > +#define VFIO_GROUP_GET_IOMMU_FD		_IO(';', 103)
> > +#define VFIO_GROUP_GET_DEVICE_FD	_IOW(';', 104, char *)
> > +
> > +/*
> > + * Structure for DMA mapping of user buffers
> > + * vaddr, dmaaddr, and size must all be page aligned
> > + */
> > +struct vfio_dma_map {
> > +	__u64	len;		/* length of structure */
> > +	__u64	vaddr;		/* process virtual addr */
> > +	__u64	dmaaddr;	/* desired and/or returned dma address */
> > +	__u64	size;		/* size in bytes */
> > +	__u64	flags;
> > +#define	VFIO_DMA_MAP_FLAG_WRITE		(1 << 0) /* req writeable DMA mem */
> > +};
> > +
> > +#define	VFIO_IOMMU_GET_FLAGS		_IOR(';', 105, __u64)
> > + /* Does the IOMMU support mapping any IOVA to any virtual address? */
> > + #define VFIO_IOMMU_FLAGS_MAP_ANY	(1 << 0)
> > +#define	VFIO_IOMMU_MAP_DMA		_IOWR(';', 106, struct vfio_dma_map)
> > +#define	VFIO_IOMMU_UNMAP_DMA		_IOWR(';', 107, struct vfio_dma_map)
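
To make the map/unmap calls concrete, a rough userspace sketch of
mapping one page through an iommu fd obtained from
VFIO_GROUP_GET_IOMMU_FD (the IOVA and size are arbitrary examples):

	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/vfio.h>

	static int map_one_page(int iommu_fd)
	{
		struct vfio_dma_map map = { .len = sizeof(map) };
		void *buf;

		/* vaddr, dmaaddr, and size must all be page aligned */
		buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return -1;

		map.vaddr = (__u64)(unsigned long)buf;
		map.dmaaddr = 0x100000;		/* example IOVA */
		map.size = 4096;
		map.flags = VFIO_DMA_MAP_FLAG_WRITE;

		return ioctl(iommu_fd, VFIO_IOMMU_MAP_DMA, &map);
	}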
> > +
> > +#define VFIO_DEVICE_GET_FLAGS		_IOR(';', 108, __u64)
> > + #define VFIO_DEVICE_FLAGS_PCI		(1 << 0)
> > + #define VFIO_DEVICE_FLAGS_DT		(1 << 1)
> > + #define VFIO_DEVICE_FLAGS_RESET	(1 << 2)
> > +#define VFIO_DEVICE_GET_NUM_REGIONS	_IOR(';', 109, int)
> > +
> > +struct vfio_region_info {
> > +	__u32	len;		/* length of structure */
> > +	__u32	index;		/* region number */
> > +	__u64	size;		/* size in bytes of region */
> > +	__u64	offset;		/* start offset of region */
> > +	__u64	flags;
> > +#define VFIO_REGION_INFO_FLAG_MMAP		(1 << 0)
> > +#define VFIO_REGION_INFO_FLAG_RO		(1 << 1)
> > +#define VFIO_REGION_INFO_FLAG_PHYS_VALID	(1 << 2)
> > +	__u64	phys;		/* physical address of region */
> > +};
> > +
> > +#define VFIO_DEVICE_GET_REGION_INFO	_IOWR(';', 110, struct vfio_region_info)
> > +
> > +#define VFIO_DEVICE_GET_NUM_IRQS	_IOR(';', 111, int)
> > +
> > +struct vfio_irq_info {
> > +	__u32	len;		/* length of structure */
> > +	__u32	index;		/* IRQ number */
> > +	__u32	count;		/* number of individual IRQs */
> > +	__u32	flags;
> > +#define VFIO_IRQ_INFO_FLAG_LEVEL		(1 << 0)
> > +};
> > +
> > +#define VFIO_DEVICE_GET_IRQ_INFO	_IOWR(';', 112, struct vfio_irq_info)
> > +
> > +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> > +#define VFIO_DEVICE_SET_IRQ_EVENTFDS	_IOW(';', 113, int)
> > +
> > +/* Unmask IRQ index, arg[0] = index */
> > +#define VFIO_DEVICE_UNMASK_IRQ		_IOW(';', 114, int)
> > +
> > +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> > +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD	_IOW(';', 115, int)
> > +
> > +#define VFIO_DEVICE_RESET		_IO(';', 116)
> > +
> > +struct vfio_dtpath {
> > +	__u32	len;		/* length of structure */
> > +	__u32	index;
> > +	__u64	flags;
> > +#define VFIO_DTPATH_FLAGS_REGION	(1 << 0)
> > +#define VFIO_DTPATH_FLAGS_IRQ		(1 << 1)
> > +	char	*path;
> > +};
> > +#define VFIO_DEVICE_GET_DTPATH		_IOWR(';', 117, struct vfio_dtpath)
> > +
> > +struct vfio_dtindex {
> > +	__u32	len;		/* length of structure */
> > +	__u32	index;
> > +	__u32	prop_type;
> > +	__u32	prop_index;
> > +	__u64	flags;
> > +#define VFIO_DTINDEX_FLAGS_REGION	(1 << 0)
> > +#define VFIO_DTINDEX_FLAGS_IRQ		(1 << 1)
> > +};
> > +#define VFIO_DEVICE_GET_DTINDEX		_IOWR(';', 118, struct vfio_dtindex)
> > +
> > +#endif /* VFIO_H */
> 
> 
> So where is the vfio-pci? Is that a separate posting?

You can find it in the tree pointed to in the patch description:

https://github.com/awilliam/linux-vfio/commit/534725d327e2b7791a229ce72d2ae8a62ee0a4e5

I was hoping to get some consensus around the new core before spending
too much time polishing up the bus driver.  Thanks for the review, it's
very much appreciated!

Alex



