[Bridge] [PATCH net-next v1 2/2] bridge: export multicast database via netlink

Thomas Graf tgraf at suug.ch
Fri Nov 30 11:26:18 UTC 2012


On 11/30/12 at 05:58pm, Cong Wang wrote:
> +enum {
> +	MDBA_UNSPEC,
> +	MDBA_MDB,
> +	MDBA_ROUTER,
> +	__MDBA_MAX,
> +};
> +#define MDBA_MAX (__MDBA_MAX - 1)
> +
> +enum {
> +	MDBA_MDB_UNSPEC,
> +	MDBA_MCADDR,
> +	MDBA_BRPORT_NO,
> +	__MDBA_MDB_MAX,
> +};
> +#define MDBA_MDB_MAX (__MDBA_MDB_MAX - 1)

So juse use these enums from iproute2 directly instead redefining
them.

> diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
> new file mode 100644
> index 0000000..3751f7d
> --- /dev/null
> +++ b/net/bridge/br_mdb.c
> @@ -0,0 +1,166 @@
> +#include <linux/err.h>
> +#include <linux/if_ether.h>
> +#include <linux/igmp.h>
> +#include <linux/kernel.h>
> +#include <linux/netdevice.h>
> +#include <linux/rculist.h>
> +#include <linux/skbuff.h>
> +#include <linux/slab.h>
> +#include <net/ip.h>
> +#if IS_ENABLED(CONFIG_IPV6)
> +#include <net/ipv6.h>
> +#include <net/mld.h>
> +#include <net/addrconf.h>
> +#include <net/ip6_checksum.h>
> +#endif
> +
> +#include "br_private.h"
> +
> +struct br_port_msg {
> +	int ifindex;
> +};

Make that __u32 ifindex and move the definition into if_bridge.h so
you can reuse it in iproute2.

> +static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
> +			       u32 seq, struct net_device *dev)
> +{
> +	struct net_bridge *br = netdev_priv(dev);
> +	struct net_bridge_port *p;
> +	struct hlist_node *n;
> +	struct nlattr *nest;
> +
> +	if (!br->multicast_router || hlist_empty(&br->router_list)) {
> +		printk(KERN_INFO "no router on bridge\n");
> +		return 0;
> +	}
> +
> +	nest = nla_nest_start(skb, MDBA_ROUTER);
> +	if (nest == NULL)
> +		return -EMSGSIZE;
> +
> +	hlist_for_each_entry_rcu(p, n, &br->router_list, rlist) {
> +		if (p && nla_put_u16(skb, MDBA_BRPORT_NO, p->port_no))
> +			goto fail;
> +	}

port_no 0 is reserved, right?

We can reduce message size here and make it easier to extend by
using p->port_no as the attribute id by doing something like this:

hlist_for_each_entry_rcu(p, n, &br->router_list, rlist)
	if (nla_put_flag(skb, p->port_no))
		goto fail;

This will result in an empty attribute body for now and if you ever
need to include more data you can simply start putting attributes
insde that empty body and old users will continue to function.

> +static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
> +			    u32 seq, struct net_device *dev)
> +{
> +	struct net_bridge *br = netdev_priv(dev);
> +	struct net_bridge_mdb_htable *mdb;
> +	struct nlattr *nest;
> +	unsigned int i;
> +	int s_idx;
> +
> +	if (br->multicast_disabled) {
> +		printk(KERN_INFO "multicast is disabled on bridge\n");
> +		return 0;
> +	}
> +
> +	mdb = rcu_dereference(br->mdb);
> +	if (!mdb) {
> +		printk(KERN_INFO "no mdb on bridge\n");
> +		return 0;
> +	}
> +
> +	cb->seq = mdb->seq;

I'm not sure how this is supposed to worl. cb->seq may not change
throughout the complete dump process or the dump will be interrupted
and the user is required to restart.

Each bridge will have its own mdb resulting in a differing seq.

> +	s_idx = cb->args[1];
> +	if (s_idx >= mdb->max)
> +		return 0;
> +
> +	nest = nla_nest_start(skb, MDBA_MDB);
> +	if (nest == NULL)
> +		return -EMSGSIZE;
> +
> +	for (i = s_idx; i < mdb->max; i++) {
> +		struct hlist_node *h;
> +		struct net_bridge_mdb_entry *mp;
> +		struct net_bridge_port_group *p, **pp;
> +		struct net_bridge_port *port;
> +
> +		hlist_for_each_entry_rcu(mp, h, &mdb->mhash[i], hlist[mdb->ver]) {
> +			for (pp = &mp->ports;
> +			     (p = rcu_dereference(*pp)) != NULL;
> +			      pp = &p->next) {
> +				port = p->port;
> +				if (port) {
> +					printk(KERN_INFO "port %u, mcaddr: %pI4\n", port->port_no, &mp->addr.u.ip4);
> +					if (nla_put_u16(skb, MDBA_BRPORT_NO, port->port_no) ||
> +					    nla_put_be32(skb, MDBA_MCADDR, p->addr.u.ip4))
> +						goto fail;
> +				}
> +			}
> +		}
> +	}
> +
> +	cb->args[1] = i;
> +	nla_nest_end(skb, nest);
> +	return 0;
> +fail:
> +	cb->args[1] = i;
> +	nla_nest_cancel(skb, nest);
> +	return -EMSGSIZE;

This still relies on the assumption that all of mdb->mhash[] will fit
into a single netlink message. Is that guaranteed? If so that would
simplify this a lot and you wouldn't have to worry about rehashes at
all.

As-is it looks broken, you store an offset into the hash but if you
run out of space you trim away the complete nested attribute again
and thus offseting will actually result in data loss because that
data has never been wired but you will skip over it next time.

You need something like this:
        for (i = 0; i < mdb->max; i++) {
                [...]
                hlist_for_each_entry_rcu(mp, h, &mdb->mhash[i], hlist[mdb->ver]) {
                        if (idx < s_idx)
                                goto skip;

                        hentry = nla_nest_start(...)
                        if (nla_put_u16(...) ||
                            nla_put_32(...)) {
                                nla_nest_cancel(skb, hentry);
				err = -EMSGSIZE;
                                goto out;
                        }
                        nla_nest_end(skb, hentry);
                skip:
                        idx++;
                }
        }

out:
	cb->args[1] = idx;
	nla_nest_end(skb, nest);
	return err;

> +}
> +
> +static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
> +{
> +	struct net_device *dev;
> +	struct net *net = sock_net(skb->sk);
> +	struct nlmsghdr *nlh;
> +	u32 seq = cb->nlh->nlmsg_seq;
> +	int idx = 0, s_idx;
> +
> +	s_idx = cb->args[0];
> +
> +	rcu_read_lock();
> +
> +	for_each_netdev_rcu(net, dev) {
> +		if (dev->priv_flags & IFF_EBRIDGE) {
> +			struct br_port_msg *bpm;
> +
> +			if (idx < s_idx)
> +				goto cont;
> +
> +			nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
> +					seq, RTM_GETMDB,
> +					sizeof(*bpm), NLM_F_MULTI);
> +			if (nlh == NULL)
> +				break;
> +
> +			bpm = nlmsg_data(nlh);
> +			bpm->ifindex = dev->ifindex;
> +			if (br_mdb_fill_info(skb, cb, seq, dev) < 0) {
> +				printk(KERN_INFO "br_mdb_fill_info failed\n");
> +				goto fail;
> +			}
> +			if (br_rports_fill_info(skb, cb, seq, dev) < 0) {
> +				printk(KERN_INFO "br_rports_fill_info failed\n");
> +				goto fail;
> +			}
> +
> +			nlmsg_end(skb, nlh);
> +cont:
> +			idx++;
> +		}
> +	}
> +
> +	rcu_read_unlock();
> +	cb->args[0] = idx;
> +	return skb->len;
> +
> +fail:
> +	rcu_read_unlock();
> +	nlmsg_cancel(skb, nlh);
> +	return skb->len;

Assuming you implement partial messages as proposed above you don't
want to nlmsg_cancel() here. Instead you just want to nlmsg_end() and
send out the message with a partial MDB_MDBA and continue where you
left off.


More information about the Bridge mailing list