[Bridge] [PATCH 3/3] net: macvtap driver

Michael S. Tsirkin mst at redhat.com
Thu Jan 28 09:34:38 PST 2010


On Wed, Jan 27, 2010 at 10:09:27PM +0100, Arnd Bergmann wrote:
> In order to use macvlan with qemu and other tools that require
> a tap file descriptor, the macvtap driver adds a small backend
> with a character device with the same interface as the tun
> driver, with a minimum set of features.
> 
> Macvtap interfaces are created in the same way as macvlan
> interfaces using ip link, but the netif is just used as a
> handle for configuration and accounting, while the data
> goes through the chardev. Each macvtap interface has its
> own character device, simplifying permission management
> significantly over the generic tun/tap driver.
> 
> Cc: Patrick McHardy <kaber at trash.net>
> Cc: Stephen Hemminger <shemminger at linux-foundation.org>
> Cc: David S. Miller" <davem at davemloft.net>
> Cc: "Michael S. Tsirkin" <mst at redhat.com>
> Cc: Herbert Xu <herbert at gondor.apana.org.au>
> Cc: Or Gerlitz <ogerlitz at voltaire.com>
> Cc: netdev at vger.kernel.org
> Cc: bridge at lists.linux-foundation.org
> Cc: linux-kernel at vger.kernel.org
> Signed-off-by: Arnd Bergmann <arnd at arndb.de>
> ---
>  drivers/net/Kconfig        |   12 +
>  drivers/net/Makefile       |    1 +
>  drivers/net/macvtap.c      |  572 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/if_macvlan.h |    1 +
>  4 files changed, 586 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/net/macvtap.c
> 
> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
> index cb0e534..411e207 100644
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -90,6 +90,18 @@ config MACVLAN
>  	  To compile this driver as a module, choose M here: the module
>  	  will be called macvlan.
>  
> +config MACVTAP
> +	tristate "MAC-VLAN based tap driver (EXPERIMENTAL)"
> +	depends on MACVLAN
> +	help
> +	  This adds a specialized tap character device driver that is based
> +	  on the MAC-VLAN network interface, called macvtap. A macvtap device
> +	  can be added in the same way as a macvlan device, using 'type
> +	  macvlan', and then be accessed through the tap user space interface.
> +
> +	  To compile this driver as a module, choose M here: the module
> +	  will be called macvtap.
> +
>  config EQUALIZER
>  	tristate "EQL (serial line load balancing) support"
>  	---help---
> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
> index 0b763cb..9595803 100644
> --- a/drivers/net/Makefile
> +++ b/drivers/net/Makefile
> @@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
>  obj-$(CONFIG_DUMMY) += dummy.o
>  obj-$(CONFIG_IFB) += ifb.o
>  obj-$(CONFIG_MACVLAN) += macvlan.o
> +obj-$(CONFIG_MACVTAP) += macvtap.o
>  obj-$(CONFIG_DE600) += de600.o
>  obj-$(CONFIG_DE620) += de620.o
>  obj-$(CONFIG_LANCE) += lance.o
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> new file mode 100644
> index 0000000..2916202
> --- /dev/null
> +++ b/drivers/net/macvtap.c
> @@ -0,0 +1,572 @@
> +#include <linux/etherdevice.h>
> +#include <linux/if_macvlan.h>
> +#include <linux/interrupt.h>
> +#include <linux/nsproxy.h>
> +#include <linux/compat.h>
> +#include <linux/if_tun.h>
> +#include <linux/module.h>
> +#include <linux/skbuff.h>
> +#include <linux/cache.h>
> +#include <linux/sched.h>
> +#include <linux/types.h>
> +#include <linux/init.h>
> +#include <linux/wait.h>
> +#include <linux/cdev.h>
> +#include <linux/fs.h>
> +
> +#include <net/net_namespace.h>
> +#include <net/rtnetlink.h>
> +#include <net/sock.h>
> +
> +/*
> + * A macvtap queue is the central object of this driver, it connects
> + * an open character device to a macvlan interface. There can be
> + * multiple queues on one interface, which map back to queues
> + * implemented in hardware on the underlying device.
> + *
> + * macvtap_proto is used to allocate queues through the sock allocation
> + * mechanism.
> + *
> + * TODO: multiqueue support is currently not implemented, even though
> + * macvtap is basically prepared for that. We will need to add this
> + * here as well as in virtio-net and qemu to get line rate on 10gbit
> + * adapters from a guest.
> + */
> +struct macvtap_queue {
> +	struct sock sk;
> +	struct socket sock;
> +	struct macvlan_dev *vlan;
> +	struct file *file;
> +};
> +
> +static struct proto macvtap_proto = {
> +	.name = "macvtap",
> +	.owner = THIS_MODULE,
> +	.obj_size = sizeof (struct macvtap_queue),
> +};
> +
> +/*
> + * Minor number matches netdev->ifindex, so need a potentially
> + * large value. This also makes it possible to split the
> + * tap functionality out again in the future by offering it
> + * from other drivers besides macvtap. As long as every device
> + * only has one tap, the interface numbers assure that the
> + * device nodes are unique.
> + */
> +static unsigned int macvtap_major;
> +#define MACVTAP_NUM_DEVS 65536
> +static struct class *macvtap_class;
> +static struct cdev macvtap_cdev;
> +
> +/*
> + * RCU usage:
> + * The macvtap_queue is referenced both from the chardev struct file
> + * and from the struct macvlan_dev using rcu_read_lock.
> + *
> + * We never actually update the contents of a macvtap_queue atomically
> + * with RCU but it is used for race-free destruction of a queue when
> + * either the file or the macvlan_dev goes away. Pointers back to
> + * the dev and the file are implicitly valid as long as the queue
> + * exists.
> + *
> + * The callbacks from macvlan are always done with rcu_read_lock held
> + * already, while in the file_operations, we get it ourselves.
> + *
> + * When destroying a queue, we remove the pointers from the file and
> + * from the dev and then synchronize_rcu to make sure no thread is
> + * still using the queue. There may still be references to the struct
> + * sock inside of the queue from outbound SKBs, but these never
> + * reference back to the file or the dev. The data structure is freed
> + * through __sk_free when both our references and any pending SKBs
> + * are gone.
> + *
> + * macvtap_lock is only used to prevent multiple concurrent open()
> + * calls to assign a new vlan->tap pointer. It could be moved into
> + * the macvlan_dev itself but is extremely rarely used.
> + */
> +static DEFINE_SPINLOCK(macvtap_lock);
> +
> +/*
> + * Choose the next free queue, for now there is only one
> + */
> +static int macvtap_set_queue(struct net_device *dev, struct file *file,
> +				struct macvtap_queue *q)
> +{
> +	struct macvlan_dev *vlan = netdev_priv(dev);
> +	int err = -EBUSY;
> +
> +	spin_lock(&macvtap_lock);
> +	if (rcu_dereference(vlan->tap))
> +		goto out;
> +
> +	err = 0;
> +	q->vlan = vlan;
> +	rcu_assign_pointer(vlan->tap, q);
> +
> +	q->file = file;
> +	rcu_assign_pointer(file->private_data, q);
> +
> +out:
> +	spin_unlock(&macvtap_lock);
> +	return err;
> +}
> +
> +/*
> + * We must destroy each queue exactly once, when either
> + * the netdev or the file go away.
> + *
> + * Using the spinlock makes sure that we don't get
> + * to the queue again after destroying it.
> + *
> + * synchronize_rcu serializes with the packet flow
> + * that uses rcu_read_lock.
> + */
> +static void macvtap_del_queue(struct macvtap_queue **qp)
> +{
> +	struct macvtap_queue *q;
> +
> +	spin_lock(&macvtap_lock);
> +	q = rcu_dereference(*qp);
> +	if (!q) {
> +		spin_unlock(&macvtap_lock);
> +		return;
> +	}
> +
> +	rcu_assign_pointer(q->vlan->tap, NULL);
> +	rcu_assign_pointer(q->file->private_data, NULL);
> +	spin_unlock(&macvtap_lock);
> +
> +	synchronize_rcu();
> +	sock_put(&q->sk);
> +}
> +
> +/*
> + * Since we only support one queue, just dereference the pointer.
> + */
> +static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
> +					       struct sk_buff *skb)
> +{
> +	struct macvlan_dev *vlan = netdev_priv(dev);
> +
> +	return rcu_dereference(vlan->tap);
> +}
> +
> +static void macvtap_del_queues(struct net_device *dev)
> +{
> +	struct macvlan_dev *vlan = netdev_priv(dev);
> +	macvtap_del_queue(&vlan->tap);
> +}
> +
> +static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
> +{
> +	rcu_read_lock_bh();
> +	return rcu_dereference(file->private_data);
> +}
> +
> +static inline void macvtap_file_put_queue(void)
> +{
> +	rcu_read_unlock_bh();
> +}
> +

I find such wrappers around rcu obscure this,
already sufficiently complex, pattern.
Might be just me.

> +/*
> + * Forward happens for data that gets sent from one macvlan
> + * endpoint to another one in bridge mode. We just take
> + * the skb and put it into the receive queue.
> + */
> +static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
> +{
> +	struct macvtap_queue *q = macvtap_get_queue(dev, skb);
> +	if (!q)
> +		return -ENOLINK;
> +
> +	skb_queue_tail(&q->sk.sk_receive_queue, skb);
> +	wake_up(q->sk.sk_sleep);
> +	return 0;
> +}
> +
> +/*
> + * Receive is for data from the external interface (lowerdev),
> + * in case of macvtap, we can treat that the same way as
> + * forward, which macvlan cannot.
> + */
> +static int macvtap_receive(struct sk_buff *skb)
> +{
> +	skb_push(skb, ETH_HLEN);
> +	return macvtap_forward(skb->dev, skb);
> +}
> +
> +static int macvtap_newlink(struct net *src_net,
> +			   struct net_device *dev,
> +			   struct nlattr *tb[],
> +			   struct nlattr *data[])
> +{
> +	struct device *classdev;
> +	dev_t devt;
> +	int err;
> +
> +	err = macvlan_common_newlink(src_net, dev, tb, data,
> +				     macvtap_receive, macvtap_forward);
> +	if (err)
> +		goto out;
> +
> +	devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
> +
> +	classdev = device_create(macvtap_class, &dev->dev, devt,
> +				 dev, "tap%d", dev->ifindex);
> +	if (IS_ERR(classdev)) {
> +		err = PTR_ERR(classdev);
> +		macvtap_del_queues(dev);
> +	}
> +
> +out:
> +	return err;
> +}
> +
> +static void macvtap_dellink(struct net_device *dev,
> +			    struct list_head *head)
> +{
> +	device_destroy(macvtap_class,
> +		       MKDEV(MAJOR(macvtap_major), dev->ifindex));
> +
> +	macvtap_del_queues(dev);
> +	macvlan_dellink(dev, head);
> +}
> +
> +static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
> +	.kind		= "macvtap",
> +	.newlink	= macvtap_newlink,
> +	.dellink	= macvtap_dellink,
> +};
> +
> +
> +static void macvtap_sock_write_space(struct sock *sk)
> +{
> +	if (!sock_writeable(sk) ||
> +	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
> +		return;
> +
> +	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
> +		wake_up_interruptible_sync(sk->sk_sleep);
> +}
> +
> +static int macvtap_open(struct inode *inode, struct file *file)
> +{
> +	struct net *net = current->nsproxy->net_ns;
> +	struct net_device *dev = dev_get_by_index(net, iminor(inode));
> +	struct macvtap_queue *q;
> +	int err;
> +

This seems to keep reference to device as long as character device is
open, which, if I understand correctly, will start printing error
messages to kernel log about once a second if you try to remove the
device.

I suspect the best way to fix this issue would be to use some kind
of notifier so that macvtap can disconnect on device removal.


> +	err = -ENODEV;
> +	if (!dev)
> +		goto out;
> +
> +	/* check if this is a macvtap device */
> +	err = -EINVAL;
> +	if (dev->rtnl_link_ops != &macvtap_link_ops)
> +		goto out;
> +
> +	err = -ENOMEM;
> +	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
> +					     &macvtap_proto);
> +	if (!q)
> +		goto out;
> +
> +	init_waitqueue_head(&q->sock.wait);
> +	q->sock.type = SOCK_RAW;
> +	q->sock.state = SS_CONNECTED;
> +	sock_init_data(&q->sock, &q->sk);
> +	q->sk.sk_allocation = GFP_ATOMIC; /* for now */
> +	q->sk.sk_write_space = macvtap_sock_write_space;
> +
> +	err = macvtap_set_queue(dev, file, q);
> +	if (err)
> +		sock_put(&q->sk);
> +
> +out:
> +	return err;
> +}
> +
> +static int macvtap_release(struct inode *inode, struct file *file)
> +{
> +	macvtap_del_queue((struct macvtap_queue **)&file->private_data);
> +	return 0;
> +}
> +
> +static unsigned int macvtap_poll(struct file *file, poll_table * wait)
> +{
> +	struct macvtap_queue *q = macvtap_file_get_queue(file);
> +	unsigned int mask = POLLERR;
> +
> +	if (!q)
> +		goto out;
> +
> +	mask = 0;
> +	poll_wait(file, &q->sock.wait, wait);
> +
> +	if (!skb_queue_empty(&q->sk.sk_receive_queue))
> +		mask |= POLLIN | POLLRDNORM;
> +
> +	if (sock_writeable(&q->sk) ||
> +	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
> +	     sock_writeable(&q->sk)))
> +		mask |= POLLOUT | POLLWRNORM;
> +
> +out:
> +	macvtap_file_put_queue();
> +	return mask;
> +}
> +
> +/* Get packet from user space buffer */
> +static ssize_t macvtap_get_user(struct macvtap_queue *q,
> +				const struct iovec *iv, size_t count,
> +				int noblock)
> +{
> +	struct sk_buff *skb;
> +	size_t len = count;
> +	int err;
> +
> +	if (unlikely(len < ETH_HLEN))
> +		return -EINVAL;
> +
> +	skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
> +
> +	if (!skb) {
> +		macvlan_count_rx(q->vlan, 0, false, false);
> +		return err;
> +	}
> +
> +	skb_reserve(skb, NET_IP_ALIGN);
> +	skb_put(skb, count);
> +
> +	if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
> +		macvlan_count_rx(q->vlan, 0, false, false);
> +		kfree_skb(skb);
> +		return -EFAULT;
> +	}
> +
> +	skb_set_network_header(skb, ETH_HLEN);
> +
> +	macvlan_start_xmit(skb, q->vlan->dev);
> +
> +	return count;
> +}
> +

I am surprised there's no GSO support.  Would it be a good idea to share
code with tun driver? That already has GSO ...
Also, network header pointer seems off for vlan packets?

> +static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
> +				 unsigned long count, loff_t pos)
> +{
> +	struct file *file = iocb->ki_filp;
> +	ssize_t result = -ENOLINK;
> +	struct macvtap_queue *q = macvtap_file_get_queue(file);
> +
> +	if (!q)
> +		goto out;
> +
> +	result = macvtap_get_user(q, iv, iov_length(iv, count),
> +			      file->f_flags & O_NONBLOCK);
> +out:
> +	macvtap_file_put_queue();
> +	return result;
> +}
> +
> +/* Put packet to the user space buffer */
> +static ssize_t macvtap_put_user(struct macvtap_queue *q,
> +				const struct sk_buff *skb,
> +				const struct iovec *iv, int len)
> +{
> +	struct macvlan_dev *vlan = q->vlan;
> +	int ret;
> +
> +	len = min_t(int, skb->len, len);
> +
> +	ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
> +
> +	macvlan_count_rx(vlan, len, ret == 0, 0);
> +
> +	return ret ? ret : len;
> +}
> +
> +static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
> +				unsigned long count, loff_t pos)
> +{
> +	struct file *file = iocb->ki_filp;
> +	struct macvtap_queue *q = macvtap_file_get_queue(file);
> +
> +	DECLARE_WAITQUEUE(wait, current);
> +	struct sk_buff *skb;
> +	ssize_t len, ret = 0;
> +
> +	if (!q) {
> +		ret = -ENOLINK;
> +		goto out;
> +	}
> +
> +	len = iov_length(iv, count);
> +	if (len < 0) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	add_wait_queue(q->sk.sk_sleep, &wait);
> +	while (len) {
> +		current->state = TASK_INTERRUPTIBLE;
> +
> +		/* Read frames from the queue */
> +		skb = skb_dequeue(&q->sk.sk_receive_queue);
> +		if (!skb) {
> +			if (file->f_flags & O_NONBLOCK) {
> +				ret = -EAGAIN;
> +				break;
> +			}
> +			if (signal_pending(current)) {
> +				ret = -ERESTARTSYS;
> +				break;
> +			}
> +			/* Nothing to read, let's sleep */
> +			schedule();
> +			continue;
> +		}
> +		ret = macvtap_put_user(q, skb, iv, len);
> +		kfree_skb(skb);
> +		break;
> +	}
> +
> +	current->state = TASK_RUNNING;
> +	remove_wait_queue(q->sk.sk_sleep, &wait);
> +
> +out:
> +	macvtap_file_put_queue();
> +	return ret;
> +}
> +

Same GSO comment here.

> +/*
> + * provide compatibility with generic tun/tap interface
> + */
> +static long macvtap_ioctl(struct file *file, unsigned int cmd,
> +			  unsigned long arg)
> +{

All of these seem to be stubs, and tun has many more that you didn't
stub out. So, why do you bother to support any ioctls at all?

> +	struct macvtap_queue *q;
> +	void __user *argp = (void __user *)arg;
> +	struct ifreq __user *ifr = argp;
> +	unsigned int __user *up = argp;
> +	unsigned int u;
> +	int err;
> +
> +	switch (cmd) {
> +	case TUNSETIFF:
> +		/* ignore the name, just look at flags */
> +		if (get_user(u, &ifr->ifr_flags))
> +			return -EFAULT;
> +		if (u != (IFF_TAP | IFF_NO_PI))
> +			return -EINVAL;
> +		return 0;
> +
> +	case TUNGETIFF:
> +		q = macvtap_file_get_queue(file);
> +		if (!q)
> +			return -ENOLINK;
> +		err = 0;
> +		if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
> +		    put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
> +			err = -EFAULT;
> +		macvtap_file_put_queue();
> +		return err;
> +
> +	case TUNGETFEATURES:
> +		if (put_user((IFF_TAP | IFF_NO_PI), up))
> +			return -EFAULT;
> +		return 0;
> +
> +	case TUNSETSNDBUF:
> +		/* ignore */
> +		return 0;
> +
> +	case TUNSETOFFLOAD:
> +		/* let the user check for future flags */
> +		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> +			  TUN_F_TSO_ECN | TUN_F_UFO))
> +			return -EINVAL;
> +
> +		/* TODO: add support for these, so far we don't
> +			 support any offload */
> +		if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> +			 TUN_F_TSO_ECN | TUN_F_UFO))
> +			return -EINVAL;
> +
> +		return 0;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
> +				 unsigned long arg)
> +{
> +	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
> +}
> +#endif
> +
> +static const struct file_operations macvtap_fops = {
> +	.owner		= THIS_MODULE,
> +	.open		= macvtap_open,
> +	.release	= macvtap_release,
> +	.aio_read	= macvtap_aio_read,
> +	.aio_write	= macvtap_aio_write,
> +	.poll		= macvtap_poll,
> +	.llseek		= no_llseek,
> +	.unlocked_ioctl	= macvtap_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= macvtap_compat_ioctl,
> +#endif
> +};
> +
> +static int macvtap_init(void)
> +{
> +	int err;
> +
> +	err = alloc_chrdev_region(&macvtap_major, 0,
> +				MACVTAP_NUM_DEVS, "macvtap");
> +	if (err)
> +		goto out1;
> +
> +	cdev_init(&macvtap_cdev, &macvtap_fops);
> +	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
> +	if (err)
> +		goto out2;
> +
> +	macvtap_class = class_create(THIS_MODULE, "macvtap");
> +	if (IS_ERR(macvtap_class)) {
> +		err = PTR_ERR(macvtap_class);
> +		goto out3;
> +	}
> +
> +	err = macvlan_link_register(&macvtap_link_ops);
> +	if (err)
> +		goto out4;
> +
> +	return 0;
> +
> +out4:
> +	class_unregister(macvtap_class);
> +out3:
> +	cdev_del(&macvtap_cdev);
> +out2:
> +	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
> +out1:
> +	return err;
> +}
> +module_init(macvtap_init);
> +
> +static void macvtap_exit(void)
> +{
> +	rtnl_link_unregister(&macvtap_link_ops);
> +	class_unregister(macvtap_class);
> +	cdev_del(&macvtap_cdev);
> +	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
> +}
> +module_exit(macvtap_exit);
> +
> +MODULE_ALIAS_RTNL_LINK("macvtap");
> +MODULE_AUTHOR("Arnd Bergmann <arnd at arndb.de>");
> +MODULE_LICENSE("GPL");
> diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
> index 9a11544..51f1512 100644
> --- a/include/linux/if_macvlan.h
> +++ b/include/linux/if_macvlan.h
> @@ -34,6 +34,7 @@ struct macvlan_dev {
>  	enum macvlan_mode	mode;
>  	int (*receive)(struct sk_buff *skb);
>  	int (*forward)(struct net_device *dev, struct sk_buff *skb);
> +	struct macvtap_queue	*tap;
>  };
>  
>  static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
> -- 
> 1.6.3.3


More information about the Bridge mailing list