[PATCH 11/15] Signal semantics

Pavel Emelyanov xemul at openvz.org
Thu Jul 26 07:55:03 PDT 2007


From: Sukadev Bhattiprolu <sukadev at us.ibm.com>

With support for multiple pid namespaces, each pid namespace has a
separate child reaper and this process needs some special handling
of signals. 

	- The child reaper should appear like a normal process to other
	  processes in its ancestor namespaces and so should be killable
	  (or not) in the usual way.

       	- The child reaper should receive, from processes in it's active
          and decendent namespaces, only those signals for which it has
          installed a signal handler.

	- System-wide signals (eg: kill signum -1) from within a child namespace
	  should only affect processes within that namespace and descendant
	  namespaces. They should not be posted to processes in ancestor or
	  sibling namespaces.

	- If the sender of a signal does not have a pid_t in the receiver's
	  namespace (eg: a process in init_pid_ns sends a signal to a process
	  in a descendant namespace), the sender's pid should appear as 0
	  in the signal's siginfo structure.

	- Existing rules for SIGIO delivery still apply and a process can
	  choose any other process in its namespace and descendant namespaces
	  to receive the SIGIO signal.
	  
	  The following appears to be incorrect in the fcntl() man page for
	  F_SETOWN.

              Sending a signal to  the  owner  process  (group)  specified  by
              F_SETOWN  is  subject  to  the  same  permissions  checks as are
              described for kill(2), where the sending process is the one that
              employs F_SETOWN (but see BUGS below).


	Current behavior is that the SIGIO signal is delivered on behalf of
	the process that caused the event (eg: made data available on the
	file) and not the process that called fcntl().

To implement the above requirements, we:

	- Add a check in check_kill_permission() for a process within a
	  namespace sending the fast-pathed, SIGKILL signal.

	- We use a flag, SIGQUEUE_CINIT, to tell the container-init if
	  a signal posted to its queue is from a process within its own
	  namespace. The flag is set in send_signal() if a process attempts
	  to send a signal to its container-init.

	  The SIGQUEUE_CINIT flag is checked in collect_signal() - if
	  the flag is set,  collect_signal() sets the KERN_SIGINFO_CINIT
	  flag in the kern_siginfo. The KERN_SIGINFO_CINIT flag indicates
	  that the sender is from within the namespace and the container-init
	  can choose to ignore the signal.
	  
	  If the KERN_SIGINFO_CINIT flag is clear in get_signal_to_deliver(),
	  the signal originated from an ancestor namespace and so the
	  container-init honors the signal.


Note: We currently use two flags, SIGQUEUE_CINIT, KERN_SIGINFO_CINIT to
	avoid modifying 'struct sigqueue'. If 'kern_siginfo' approach is
	feasible, we could use 'kern_siginfo' in sigqueue and eliminate
	SIGQUEUE_CINIT.

Signed-off-by: Sukadev Bhattiprolu <sukadev at us.ibm.com>
Signed-off-by: Pavel Emelyanov <xemul at openvz.org>

---

 include/linux/pid.h    |    3 ++
 include/linux/signal.h |    1 
 kernel/pid.c           |   46 +++++++++++++++++++++++++++++++++++
 kernel/signal.c        |   63 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 112 insertions(+), 1 deletion(-)

diff -upr linux-2.6.23-rc1-mm1.orig/include/linux/pid.h linux-2.6.23-rc1-mm1-7/include/linux/pid.h
--- linux-2.6.23-rc1-mm1.orig/include/linux/pid.h	2007-07-26 16:34:45.000000000 +0400
+++ linux-2.6.23-rc1-mm1-7/include/linux/pid.h	2007-07-26 16:36:37.000000000 +0400
@@ -71,6 +77,9 @@ extern struct task_struct *FASTCALL(pid_
 extern struct task_struct *FASTCALL(get_pid_task(struct pid *pid,
 						enum pid_type));
 
+extern int task_visible_in_pid_ns(struct task_struct *tsk,
+					struct pid_namespace *ns);
+extern int pid_ns_equal(struct task_struct *tsk);
 extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
 
 /*
diff -upr linux-2.6.23-rc1-mm1.orig/include/linux/signal.h linux-2.6.23-rc1-mm1-7/include/linux/signal.h
--- linux-2.6.23-rc1-mm1.orig/include/linux/signal.h	2007-07-26 16:34:45.000000000 +0400
+++ linux-2.6.23-rc1-mm1-7/include/linux/signal.h	2007-07-26 16:36:37.000000000 +0400
@@ -20,6 +27,7 @@ struct sigqueue {
 
 /* flags values. */
 #define SIGQUEUE_PREALLOC	1
+#define SIGQUEUE_CINIT		2
 
 struct sigpending {
 	struct list_head list;
diff -upr linux-2.6.23-rc1-mm1.orig/kernel/pid.c linux-2.6.23-rc1-mm1-7/kernel/pid.c
--- linux-2.6.23-rc1-mm1.orig/kernel/pid.c	2007-07-26 16:34:45.000000000 +0400
+++ linux-2.6.23-rc1-mm1-7/kernel/pid.c	2007-07-26 16:36:37.000000000 +0400
@@ -318,6 +355,52 @@ struct task_struct * fastcall pid_task(s
 }
 
 /*
+ * Return TRUE if the task @p is visible in the pid namespace @ns
+ *
+ * Note: @p is visible in @ns if the active-pid-ns of @p is either equal to
+ * 	@ns or is a descendant of @ns.
+ *
+ * 	@p is not visible in @ns if active-pid-ns of @p is an ancestor of @ns.
+ * 	Eg: Processes in init-pid-ns are not visible in child pid namespaces.
+ * 	They should not receive any system-wide signals from a child-pid-
+ * 	namespace for instance.
+ */
+int task_visible_in_pid_ns(struct task_struct *p, struct pid_namespace *ns)
+{
+	int i;
+	struct pid *pid = task_pid(p);
+
+	if (!pid)
+		return 0;
+
+	for (i = 0; i <= pid->level; i++) {
+		if (pid->numbers[i].ns == ns)
+			return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(task_visible_in_pid_ns);
+
+/*
+ * Return TRUE if the active pid namespace of @tsk is same as active
+ * pid namespace of 'current'
+ */
+
+static inline struct pid_namespace *pid_active_ns(struct pid *pid)
+{
+	if (pid == NULL)
+		return NULL;
+
+	return pid->numbers[pid->level].ns;
+}
+
+int pid_ns_equal(struct task_struct *tsk)
+{
+	return pid_active_ns(task_pid(tsk)) == pid_active_ns(task_pid(current));
+}
+
+/*
  * Must be called under rcu_read_lock() or with tasklist_lock read-held.
  */
 struct task_struct *find_task_by_pid_type_ns(int type, int nr,
diff -upr linux-2.6.23-rc1-mm1.orig/kernel/signal.c linux-2.6.23-rc1-mm1-7/kernel/signal.c
--- linux-2.6.23-rc1-mm1.orig/kernel/signal.c	2007-07-26 16:34:45.000000000 +0400
+++ linux-2.6.23-rc1-mm1-7/kernel/signal.c	2007-07-26 16:36:37.000000000 +0400
@@ -323,6 +325,9 @@ static int collect_signal(int sig, struc
 	if (first) {
 		list_del_init(&first->list);
 		copy_siginfo(info, &first->info);
+		if (first->flags & SIGQUEUE_CINIT)
+			kinfo->flags |= KERN_SIGINFO_CINIT;
+
 		__sigqueue_free(first);
 		if (!still_pending)
 			sigdelset(&list->signal, sig);
@@ -343,6 +348,8 @@ static int collect_signal(int sig, struc
 {
 	int sig = next_signal(pending, mask);
 
+	kinfo->flags &= ~KERN_SIGINFO_CINIT;
+
 	if (sig) {
 		if (current->notifier) {
 			if (sigismember(current->notifier_mask, sig)) {
@@ -522,6 +547,20 @@ static int rm_from_queue(unsigned long m
 	return 1;
 }
 
+static int deny_signal_to_container_init(struct task_struct *tsk, int sig)
+{
+	/*
+	 * If receiver is the container-init of sender and signal is SIGKILL
+	 * reject it right-away. If signal is any other one, let the container
+	 * init decide (in get_signal_to_deliver()) whether to handle it or
+	 * ignore it.
+	 */
+	if (is_container_init(tsk) && (sig == SIGKILL) && pid_ns_equal(tsk))
+		return -EPERM;
+
+	return 0;
+}
+
 /*
  * Bad permissions for sending the signal
  */
@@ -545,6 +584,10 @@ static int check_kill_permission(int sig
 	    && !capable(CAP_KILL))
 		return error;
 
+	error = deny_signal_to_container_init(t, sig);
+	if (error)
+		return error;
+
 	return security_task_kill(t, info, sig, 0);
 }
 
@@ -659,6 +702,34 @@ static void handle_stop_signal(int sig, 
 	}
 }
 
+static void encode_sender_info(struct task_struct *t, struct sigqueue *q)
+{
+	/*
+	 * If sender (i.e 'current') and receiver have the same active
+	 * pid namespace and the receiver is the container-init, set the
+	 * SIGQUEUE_CINIT flag. This tells the container-init that the
+	 * signal originated in its own namespace and so it can choose
+	 * to ignore the signal.
+	 *
+	 * If the receiver is the container-init of a pid namespace,
+	 * but the sender is from an ancestor pid namespace, the
+	 * container-init cannot ignore the signal. So clear the
+	 * SIGQUEUE_CINIT flag in this case.
+	 *
+	 * Also, if the sender does not have a pid_t in the receiver's
+	 * active pid namespace, set si_pid to 0 and pretend it originated
+	 * from the kernel.
+	 */
+	if (pid_ns_equal(t)) {
+		if (is_container_init(t)) {
+			q->flags |= SIGQUEUE_CINIT;
+		}
+	} else {
+		q->info.si_pid = 0;
+		q->info.si_code = SI_KERNEL;
+	}
+}
+
 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			struct sigpending *signals)
 {
@@ -710,6 +781,7 @@ static int send_signal(int sig, struct s
 			copy_siginfo(&q->info, info);
 			break;
 		}
+		encode_sender_info(t, q);
 	} else if (!is_si_special(info)) {
 		if (sig >= SIGRTMIN && info->si_code != SI_USER)
 		/*
@@ -1149,6 +1221,8 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
 static int kill_something_info(int sig, struct siginfo *info, int pid)
 {
 	int ret;
+	struct pid_namespace *my_ns = task_active_pid_ns(current);
+
 	rcu_read_lock();
 	if (!pid) {
 		ret = kill_pgrp_info(sig, info, task_pgrp(current));
@@ -1158,6 +1232,13 @@ static int kill_something_info(int sig, 
 
 		read_lock(&tasklist_lock);
 		for_each_process(p) {
+			/*
+			 * System-wide signals apply only to the sender's
+			 * pid namespace, unless issued from init_pid_ns.
+			 */
+			if (!task_visible_in_pid_ns(p, my_ns))
+				continue;
+
 			if (p->pid > 1 && p->tgid != current->tgid) {
 				int err = group_send_sig_info(sig, info, p);
 				++count;
@@ -1852,7 +1950,7 @@ relock:
 		 * within that pid space. It can of course get signals from
 		 * its parent pid space.
 		 */
-		if (current == task_child_reaper(current))
+		if (kinfo.flags & KERN_SIGINFO_CINIT)
 			continue;
 
 		if (sig_kernel_stop(signr)) {



More information about the Containers mailing list