[PATCH] iproute2: Add processless network namespace support.

Eric W. Biederman ebiederm at xmission.com
Wed Jun 29 14:27:12 PDT 2011


Ping.  Did you see this patch?


ebiederm at xmission.com (Eric W. Biederman) writes:

> The goal of this code change is to implement a mechanism such that it is
> simple to work with a kernel that is using multiple network namespaces
> at once.
>
> This comes in handy for interacting with vpns where there may be rfc1918
> address overlaps, and different policies, default routes, name servers
> and the like.
>
> Configuration specific to a network namespace that would ordinarily be
> stored under /etc/ is stored under /etc/netns/<name>.  For example if
> the dns server configuration is different for your vpn you would create
> a file /etc/netns/myvpn/resolv.conf.
>
> File descriptors that can be used to manipulate a network namespace can
> be created by opening /var/run/netns/<NAME>.
>
> This adds the following commands to iproute.
> ip netns add NAME
> ip netns delete NAME
> ip netns monitor
> ip netns list
> ip netns exec NAME cmd ....
> ip link set DEV netns NAME
>
> ip netns exec exists to cater to the vast majority of programs that only
> know how to operate in a single network namespace.  ip netns exec
> changes the default network namespace, creates a new mount namespace,
> remounts /sys and bind mounts netns specific configuration files to
> their standard locations.
>
> Signed-off-by: Eric W. Biederman <ebiederm at xmission.com>
> ---
>  include/linux/if_link.h |    1 +
>  ip/Makefile             |    2 +-
>  ip/ip.c                 |    4 +-
>  ip/ip_common.h          |    2 +
>  ip/iplink.c             |    8 +-
>  ip/ipnetns.c            |  314 +++++++++++++++++++++++++++++++++++++++++++++++
>  man/man8/ip.8           |   56 +++++++++
>  7 files changed, 383 insertions(+), 4 deletions(-)
>  create mode 100644 ip/ipnetns.c
>
> diff --git a/include/linux/if_link.h b/include/linux/if_link.h
> index e4a3a2d..304c44f 100644
> --- a/include/linux/if_link.h
> +++ b/include/linux/if_link.h
> @@ -136,6 +136,7 @@ enum {
>  	IFLA_PORT_SELF,
>  	IFLA_AF_SPEC,
>  	IFLA_GROUP,		/* Group the device belongs to */
> +	IFLA_NET_NS_FD,
>  	__IFLA_MAX
>  };
>  
> diff --git a/ip/Makefile b/ip/Makefile
> index 6054e8a..2ee4e7c 100644
> --- a/ip/Makefile
> +++ b/ip/Makefile
> @@ -1,4 +1,4 @@
> -IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o \
> +IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
>      rtm_map.o iptunnel.o ip6tunnel.o tunnel.o ipneigh.o ipntable.o iplink.o \
>      ipmaddr.o ipmonitor.o ipmroute.o ipprefix.o iptuntap.o \
>      ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
> diff --git a/ip/ip.c b/ip/ip.c
> index b127d57..7f0c468 100644
> --- a/ip/ip.c
> +++ b/ip/ip.c
> @@ -44,7 +44,8 @@ static void usage(void)
>  "Usage: ip [ OPTIONS ] OBJECT { COMMAND | help }\n"
>  "       ip [ -force ] -batch filename\n"
>  "where  OBJECT := { link | addr | addrlabel | route | rule | neigh | ntable |\n"
> -"                   tunnel | tuntap | maddr | mroute | mrule | monitor | xfrm }\n"
> +"                   tunnel | tuntap | maddr | mroute | mrule | monitor | xfrm |\n"
> +"                   netns }\n"
>  "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
>  "                    -f[amily] { inet | inet6 | ipx | dnet | link } |\n"
>  "                    -l[oops] { maximum-addr-flush-attempts } |\n"
> @@ -80,6 +81,7 @@ static const struct cmd {
>  	{ "xfrm",	do_xfrm },
>  	{ "mroute",	do_multiroute },
>  	{ "mrule",	do_multirule },
> +	{ "netns",	do_netns },
>  	{ "help",	do_help },
>  	{ 0 }
>  };
> diff --git a/ip/ip_common.h b/ip/ip_common.h
> index a114186..5e5fb76 100644
> --- a/ip/ip_common.h
> +++ b/ip/ip_common.h
> @@ -38,6 +38,7 @@ extern int do_ipmonitor(int argc, char **argv);
>  extern int do_multiaddr(int argc, char **argv);
>  extern int do_multiroute(int argc, char **argv);
>  extern int do_multirule(int argc, char **argv);
> +extern int do_netns(int argc, char **argv);
>  extern int do_xfrm(int argc, char **argv);
>  
>  static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb)
> @@ -64,6 +65,7 @@ struct link_util
>  };
>  
>  struct link_util *get_link_kind(const char *kind);
> +int get_netns_fd(const char *name);
>  
>  #ifndef	INFINITY_LIFE_TIME
>  #define     INFINITY_LIFE_TIME      0xFFFFFFFFU
> diff --git a/ip/iplink.c b/ip/iplink.c
> index 48c0254..e5325a6 100644
> --- a/ip/iplink.c
> +++ b/ip/iplink.c
> @@ -67,6 +67,7 @@ void iplink_usage(void)
>  	fprintf(stderr, "	                  [ broadcast LLADDR ]\n");
>  	fprintf(stderr, "	                  [ mtu MTU ]\n");
>  	fprintf(stderr, "	                  [ netns PID ]\n");
> +	fprintf(stderr, "	                  [ netns NAME ]\n");
>  	fprintf(stderr, "			  [ alias NAME ]\n");
>  	fprintf(stderr, "	                  [ vf NUM [ mac LLADDR ]\n");
>  	fprintf(stderr, "				   [ vlan VLANID [ qos VLAN-QOS ] ]\n");
> @@ -304,9 +305,12 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
>                          NEXT_ARG();
>                          if (netns != -1)
>                                  duparg("netns", *argv);
> -                        if (get_integer(&netns, *argv, 0))
> +			if ((netns = get_netns_fd(*argv)) >= 0)
> +				addattr_l(&req->n, sizeof(*req), IFLA_NET_NS_FD, &netns, 4);
> +			else if (get_integer(&netns, *argv, 0) == 0)
> +				addattr_l(&req->n, sizeof(*req), IFLA_NET_NS_PID, &netns, 4);
> +			else
>                                  invarg("Invalid \"netns\" value\n", *argv);
> -                        addattr_l(&req->n, sizeof(*req), IFLA_NET_NS_PID, &netns, 4);
>  		} else if (strcmp(*argv, "multicast") == 0) {
>  			NEXT_ARG();
>  			req->i.ifi_change |= IFF_MULTICAST;
> diff --git a/ip/ipnetns.c b/ip/ipnetns.c
> new file mode 100644
> index 0000000..db7007c
> --- /dev/null
> +++ b/ip/ipnetns.c
> @@ -0,0 +1,314 @@
> +#define _ATFILE_SOURCE
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <sys/wait.h>
> +#include <sys/inotify.h>
> +#include <sys/mount.h>
> +#include <sys/param.h>
> +#include <sys/syscall.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <sched.h>
> +#include <fcntl.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <unistd.h>
> +
> +#include "utils.h"
> +#include "ip_common.h"
> +
> +#define NETNS_RUN_DIR "/var/run/netns"
> +#define NETNS_ETC_DIR "/etc/netns"
> +
> +#ifndef CLONE_NEWNET
> +#define CLONE_NEWNET 0x40000000	/* New network namespace (lo, device, names sockets, etc) */
> +#endif
> +
> +#ifndef MNT_DETACH
> +#define MNT_DETACH	0x00000002	/* Just detach from the tree */
> +#endif /* MNT_DETACH */
> +
> +static int setns(int fd, int nstype)
> +{
> +#ifdef __NR_setns
> +	return syscall(__NR_setns, fd, nstype);
> +#else
> +	errno = ENOSYS;
> +	return -1;
> +#endif
> +}
> +
> +
> +static int touch(const char *path, mode_t mode)
> +{
> +	int fd;
> +	fd = open(path, O_RDONLY|O_CREAT, mode);
> +	if (fd < 0)
> +		return -1;
> +	close(fd);
> +	return 0;
> +}
> +
> +static void usage(void) __attribute__((noreturn));
> +
> +static void usage(void)
> +{
> +	fprintf(stderr, "Usage: ip netns list\n");
> +	fprintf(stderr, "       ip netns add NAME\n");
> +	fprintf(stderr, "       ip netns delete NAME\n");
> +	fprintf(stderr, "       ip netns exec NAME cmd ...\n");
> +	fprintf(stderr, "       ip netns monitor\n");
> +	exit(-1);
> +}
> +
> +int get_netns_fd(const char *name)
> +{
> +	char pathbuf[MAXPATHLEN];
> +	const char *path, *ptr;
> +
> +	path = name;
> +	ptr = strchr(name, '/');
> +	if (!ptr) {
> +		snprintf(pathbuf, sizeof(pathbuf), "%s/%s",
> +			NETNS_RUN_DIR, name );
> +		path = pathbuf;
> +	}
> +	return open(path, O_RDONLY);
> +}
> +
> +static int netns_list(int argc, char **argv)
> +{
> +	struct dirent *entry;
> +	DIR *dir;
> +
> +	dir = opendir(NETNS_RUN_DIR);
> +	if (!dir)
> +		return 0;
> +
> +	while ((entry = readdir(dir)) != NULL) {
> +		if (strcmp(entry->d_name, ".") == 0)
> +			continue;
> +		if (strcmp(entry->d_name, "..") == 0)
> +			continue;
> +		printf("%s\n", entry->d_name);
> +	}
> +	closedir(dir);
> +	return 0;
> +}
> +
> +static void bind_etc(const char *name)
> +{
> +	char etc_netns_path[MAXPATHLEN];
> +	char netns_name[MAXPATHLEN];
> +	char etc_name[MAXPATHLEN];
> +	struct dirent *entry;
> +	DIR *dir;
> +
> +	snprintf(etc_netns_path, sizeof(etc_netns_path), "%s/%s", NETNS_ETC_DIR, name);
> +	dir = opendir(etc_netns_path);
> +	if (!dir)
> +		return;
> +
> +	while ((entry = readdir(dir)) != NULL) {
> +		if (strcmp(entry->d_name, ".") == 0)
> +			continue;
> +		if (strcmp(entry->d_name, "..") == 0)
> +			continue;
> +		snprintf(netns_name, sizeof(netns_name), "%s/%s", etc_netns_path, entry->d_name);
> +		snprintf(etc_name, sizeof(etc_name), "/etc/%s", entry->d_name);
> +		if (mount(netns_name, etc_name, "none", MS_BIND, NULL) < 0) {
> +			fprintf(stderr, "Bind %s -> %s failed: %s\n",
> +				netns_name, etc_name, strerror(errno));
> +		}
> +	}
> +	closedir(dir);
> +}
> +
> +static int netns_exec(int argc, char **argv)
> +{
> +	/* Setup the proper environment for apps that are not netns
> +	 * aware, and execute a program in that environment.
> +	 */
> +	const char *name, *cmd;
> +	char net_path[MAXPATHLEN];
> +	int netns;
> +
> +	if (argc < 1) {
> +		fprintf(stderr, "No netns name specified\n");
> +		return -1;
> +	}
> +	if (argc < 2) {
> +		fprintf(stderr, "No cmd specified\n");
> +		return -1;
> +	}
> +	name = argv[0];
> +	cmd = argv[1];
> +	snprintf(net_path, sizeof(net_path), "%s/%s", NETNS_RUN_DIR, name);
> +	netns = open(net_path, O_RDONLY);
> +	if (netns < 0) {
> +		fprintf(stderr, "Cannot open network namespace: %s\n",
> +			strerror(errno));
> +		return -1;
> +	}
> +	if (setns(netns, CLONE_NEWNET) < 0) {
> +		fprintf(stderr, "seting the network namespace failed: %s\n",
> +			strerror(errno));
> +		return -1;
> +	}
> +
> +	if (unshare(CLONE_NEWNS) < 0) {
> +		fprintf(stderr, "unshare failed: %s\n", strerror(errno));
> +		return -1;
> +	}
> +	/* Mount a version of /sys that describes the network namespace */
> +	if (umount2("/sys", MNT_DETACH) < 0) {
> +		fprintf(stderr, "umount of /sys failed: %s\n", strerror(errno));
> +		return -1;
> +	}
> +	if (mount(name, "/sys", "sysfs", 0, NULL) < 0) {
> +		fprintf(stderr, "mount of /sys failed: %s\n",strerror(errno));
> +		return -1;
> +	}
> +
> +	/* Setup bind mounts for config files in /etc */
> +	bind_etc(name);
> +
> +	if (execvp(cmd, argv + 1)  < 0)
> +		fprintf(stderr, "exec of %s failed: %s\n",
> +			cmd, strerror(errno));
> +	exit(-1);
> +}
> +
> +static int netns_delete(int argc, char **argv)
> +{
> +	const char *name;
> +	char netns_path[MAXPATHLEN];
> +
> +	if (argc < 1) {
> +		fprintf(stderr, "No netns name specified\n");
> +		return -1;
> +	}
> +
> +	name = argv[0];
> +	snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, name);
> +	umount2(netns_path, MNT_DETACH);
> +	if (unlink(netns_path) < 0) {
> +		fprintf(stderr, "Cannot remove %s: %s\n",
> +			netns_path, strerror(errno));
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +static int netns_add(int argc, char **argv)
> +{
> +	/* This function creates a new network namespace and
> +	 * a new mount namespace and bind them into a well known
> +	 * location in the filesystem based on the name provided.
> +	 *
> +	 * The mount namespace is created so that any necessary
> +	 * userspace tweaks like remounting /sys, or bind mounting
> +	 * a new /etc/resolv.conf can be shared between users.
> +	 */
> +	char netns_path[MAXPATHLEN];
> +	const char *name;
> +
> +	if (argc < 1) {
> +		fprintf(stderr, "No netns name specified\n");
> +		return -1;
> +	}
> +	name = argv[0];
> +
> +	snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, name);
> +
> +	/* Create the base netns directory if it doesn't exist */
> +	mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
> +
> +	/* Create the filesystem state */
> +	if (touch(netns_path, 0) < 0) {
> +		fprintf(stderr, "Could not create %s: %s\n",
> +			netns_path, strerror(errno));
> +		goto out_delete;
> +	}
> +	if (unshare(CLONE_NEWNET) < 0) {
> +		fprintf(stderr, "Failed to create a new network namespace: %s\n",
> +			strerror(errno));
> +		goto out_delete;
> +	}
> +
> +	/* Bind the netns last so I can watch for it */
> +	if (mount("/proc/self/ns/net", netns_path, "none", MS_BIND, NULL) < 0) {
> +		fprintf(stderr, "Bind /proc/self/ns/net -> %s failed: %s\n",
> +			netns_path, strerror(errno));
> +		goto out_delete;
> +	}
> +	return 0;
> +out_delete:
> +	netns_delete(argc, argv);
> +	exit(-1);
> +	return -1;
> +}
> +
> +
> +static int netns_monitor(int argc, char **argv)
> +{
> +	char buf[4096];
> +	struct inotify_event *event;
> +	int fd;
> +	fd = inotify_init();
> +	if (fd < 0) {
> +		fprintf(stderr, "inotify_init failed: %s\n",
> +			strerror(errno));
> +		return -1;
> +	}
> +	if (inotify_add_watch(fd, NETNS_RUN_DIR, IN_CREATE | IN_DELETE) < 0) {
> +		fprintf(stderr, "inotify_add_watch failed: %s\n",
> +			strerror(errno));
> +		return -1;
> +	}
> +	for(;;) {
> +		ssize_t len = read(fd, buf, sizeof(buf));
> +		if (len < 0) {
> +			fprintf(stderr, "read failed: %s\n",
> +				strerror(errno));
> +			return -1;
> +		}
> +		for (event = (struct inotify_event *)buf;
> +		     (char *)event < &buf[len];
> +		     event = (struct inotify_event *)((char *)event + sizeof(*event) + event->len)) {
> +			if (event->mask & IN_CREATE)
> +				printf("add %s\n", event->name);
> +			if (event->mask & IN_DELETE)
> +				printf("delete %s\n", event->name);
> +		}
> +	}
> +	return 0;
> +}
> +
> +int do_netns(int argc, char **argv)
> +{
> +	if (argc < 1)
> +		return netns_list(0, NULL);
> +
> +	if ((matches(*argv, "list") == 0) || (matches(*argv, "show") == 0) ||
> +	    (matches(*argv, "lst") == 0))
> +		return netns_list(argc-1, argv+1);
> +
> +	if (matches(*argv, "help") == 0)
> +		usage();
> +
> +	if (matches(*argv, "add") == 0)
> +		return netns_add(argc-1, argv+1);
> +
> +	if (matches(*argv, "delete") == 0)
> +		return netns_delete(argc-1, argv+1);
> +
> +	if (matches(*argv, "exec") == 0)
> +		return netns_exec(argc-1, argv+1);
> +
> +	if (matches(*argv, "monitor") == 0)
> +		return netns_monitor(argc-1, argv+1);
> +
> +	fprintf(stderr, "Command \"%s\" is unknown, try \"ip netns help\".\n", *argv);
> +	exit(-1);
> +}
> diff --git a/man/man8/ip.8 b/man/man8/ip.8
> index c5248ef..1935dc5 100644
> --- a/man/man8/ip.8
> +++ b/man/man8/ip.8
> @@ -85,6 +85,9 @@ ip \- show / manipulate routing, devices, policy routing and tunnels
>  .B  netns
>  .IR PID " |"
>  .br
> +.B  netns
> +.IR NETNSNAME " |"
> +.br
>  .B alias
>  .IR NAME  " |"
>  .br
> @@ -162,6 +165,17 @@ tentative " | " deprecated " | " dadfailed " | " temporary " ]"
>  .BR "ip addrlabel" " { " list " | " flush " }"
>  
>  .ti -8
> +.BR "ip netns" " { " list " | " monitor " } "
> +
> +.ti -8
> +.BR "ip netns" " { " add " | " delete " } "
> +.I NETNSNAME
> +
> +.ti -8
> +.BR "ip netns exec "
> +.I NETNSNAME command ...
> +
> +.ti -8
>  .BR "ip route" " { "
>  .BR list " | " flush " } "
>  .I  SELECTOR
> @@ -1006,6 +1020,11 @@ move the device to the network namespace associated with the process
>  .IR "PID".
>  
>  .TP
> +.BI netns " NETNSNAME"
> +move the device to the network namespace associated with name
> +.IR "NETNSNAME".
> +
> +.TP
>  .BI alias " NAME"
>  give the device a symbolic name for easy reference.
>  
> @@ -2470,6 +2489,43 @@ at any time.
>  It prepends the history with the state snapshot dumped at the moment
>  of starting.
>  
> +.SH ip netns - process network namespace management
> +
> +A network namespace is logically another copy of the network stack,
> +with its own routes, firewall rules, and network devices.
> +
> +By convention a named network namespace is an object at
> +.BR "/var/run/netns/" NAME
> +that can be opened.  The file descriptor resulting from opening
> +.BR "/var/run/netns/" NAME 
> +refers to the specified network namespace.  Holding that file
> +descriptor open keeps the network namespace alive.  The file
> +descriptor can be used with the
> +.B setns(2)
> +system call to change the network namespace associated with a task.
> +
> +The convention for network namespace aware applications is to look
> +for global network configuration files first in
> +.BR "/etc/netns/" NAME "/"
> +then in
> +.BR "/etc/".
> +For example, if you want a different version of
> +.BR /etc/resolv.conf
> +for a network namespace used to isolate your vpn you would name it
> +.BR /etc/netns/myvpn/resolv.conf.
> +
> +.B ip netns exec
> +automates handling of this configuration file convention for network
> +namespace unaware applications, by creating a mount namespace and
> +bind mounting all of the per network namespace configuration files into
> +their traditional location in /etc.
> +
> +.SS ip netns list - show all of the named network namespaces
> +.SS ip netns monitor - report when network namespace names are created and destroyed
> +.SS ip netns add NAME - create a new named network namespace
> +.SS ip netns delete NAME - delete the name of a network namespace
> +.SS ip netns exec NAME cmd ... - Run cmd in the named network namespace
> +
>  .SH ip xfrm - setting xfrm
>  xfrm is an IP framework, which can transform format of the datagrams,
>  .br


More information about the Containers mailing list