[Openais] [PATCH corosync trunk] IPC: gracefully handle running out of file descriptors.

Angus Salkeld asalkeld at redhat.com
Sun Aug 8 18:28:44 PDT 2010


Whenever we accept a new connection or close an
existing one, check the number of available file
descriptors and either publish or withdraw the
IPC listening socket.

Signed-off-by: Angus Salkeld <asalkeld at redhat.com>
---
 exec/coroipcs.c |  104 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 87 insertions(+), 17 deletions(-)

diff --git a/exec/coroipcs.c b/exec/coroipcs.c
index aca9053..05b6a17 100644
--- a/exec/coroipcs.c
+++ b/exec/coroipcs.c
@@ -169,6 +169,14 @@ struct conn_info {
 	int poll_state;
 };
 
+static int32_t coro_server_fd = -1;
+
+static void server_socket_publish(void);
+
+static void server_socket_withdraw(void);
+
+static void server_socket_check(void);
+
 static int shared_mem_dispatch_bytes_left (const struct conn_info *conn_info);
 
 static void outq_flush (struct conn_info *conn_info);
@@ -180,8 +188,6 @@ static void ipc_disconnect (struct conn_info *conn_info);
 static void msg_send (void *conn, const struct iovec *iov, unsigned int iov_len,
 		      int locked);
 
-static void _corosync_ipc_init(void);
-
 #define log_printf(level, format, args...) \
 do { \
 	if (api->log_printf) \
@@ -513,6 +519,7 @@ static inline int conn_info_destroy (struct conn_info *conn_info)
 		conn_info->state == CONN_STATE_DISCONNECT_INACTIVE) {
 		list_del (&conn_info->list);
 		close (conn_info->fd);
+		server_socket_check();
 		api->free (conn_info);
 		return (-1);
 	}
@@ -568,6 +575,7 @@ static inline int conn_info_destroy (struct conn_info *conn_info)
 		api->free (conn_info->private_data);
 	}
 	close (conn_info->fd);
+	server_socket_check();
 	res = circular_memory_unmap (conn_info->dispatch_buffer, conn_info->dispatch_size);
 	zcb_all_free (conn_info);
 	api->free (conn_info);
@@ -959,7 +967,7 @@ extern void coroipcs_ipc_init_v2 (
 	api->old_log_printf	= NULL;
 
 	log_printf (LOGSYS_LEVEL_DEBUG, "you are using ipc api v2\n");
-	_corosync_ipc_init ();
+	server_socket_publish();
 }
 
 extern void coroipcs_ipc_init (
@@ -997,34 +1005,69 @@ extern void coroipcs_ipc_init (
 
 	log_printf (LOGSYS_LEVEL_DEBUG, "you are using ipc api v1\n");
 
-	_corosync_ipc_init ();
+	server_socket_publish();
 }
 
-static void _corosync_ipc_init(void)
+/*
+ * Only 12 sockets are actually used, but a larger number is
+ * reserved for safety.
+ */
+#define COROIPC_NUM_RESERVED_SOCKETS 25
+
+static int32_t num_avail_sockets(void)
 {
-	int server_fd;
+	struct rlimit lim;
+	int32_t open_socks = 0;
+	int32_t res;
+	struct list_head *list;
+
+	if (getrlimit(RLIMIT_NOFILE, &lim) == -1) {
+		char error_str[100];
+		strerror_r(errno, error_str, 100);
+		log_printf(LOGSYS_LEVEL_ERROR,
+			"getrlimit: %s\n", error_str);
+		return -1;
+	}
+
+	for (list = conn_info_list_head.next; list != &conn_info_list_head;
+		list = list->next) {
+		open_socks++;
+	}
+	res = (lim.rlim_cur - (open_socks + COROIPC_NUM_RESERVED_SOCKETS));
+	log_printf(LOGSYS_LEVEL_DEBUG, "(lim.rlim_cur:%d - (open_socks:%d + reserved:%d) == %d\n",
+		lim.rlim_cur, open_socks, COROIPC_NUM_RESERVED_SOCKETS, res);
+	return res;
+}
+
+static void server_socket_publish(void)
+{
+	int32_t res = 0;
 	struct sockaddr_un un_addr;
-	int res;
+
+	log_printf (LOGSYS_LEVEL_WARNING,
+		"Publishing socket for client connections.\n");
 
 	/*
 	 * Create socket for IPC clients, name socket, listen for connections
 	 */
 #if defined(COROSYNC_SOLARIS)
-	server_fd = socket (PF_UNIX, SOCK_STREAM, 0);
+	coro_server_fd = socket (PF_UNIX, SOCK_STREAM, 0);
 #else
-	server_fd = socket (PF_LOCAL, SOCK_STREAM, 0);
+	coro_server_fd = socket (PF_LOCAL, SOCK_STREAM, 0);
 #endif
-	if (server_fd == -1) {
+	if (coro_server_fd == -1) {
 		log_printf (LOGSYS_LEVEL_CRIT, "Cannot create client connections socket.\n");
 		api->fatal_error ("Can't create library listen socket");
 	}
 
-	res = fcntl (server_fd, F_SETFL, O_NONBLOCK);
+	res = fcntl (coro_server_fd, F_SETFL, O_NONBLOCK);
 	if (res == -1) {
 		char error_str[100];
-		strerror_r (errno, error_str, 100);
-		log_printf (LOGSYS_LEVEL_CRIT, "Could not set non-blocking operation on server socket: %s\n", error_str);
-		api->fatal_error ("Could not set non-blocking operation on server socket");
+		strerror_r(errno, error_str, 100);
+		log_printf(LOGSYS_LEVEL_CRIT,
+			"Could not set non-blocking operation on server socket: %s\n",
+			error_str);
+		api->fatal_error("Could not set non-blocking operation on server socket");
 	}
 
 	memset (&un_addr, 0, sizeof (struct sockaddr_un));
@@ -1048,7 +1091,7 @@ static void _corosync_ipc_init(void)
 	}
 #endif
 
-	res = bind (server_fd, (struct sockaddr *)&un_addr, COROSYNC_SUN_LEN(&un_addr));
+	res = bind (coro_server_fd, (struct sockaddr *)&un_addr, COROSYNC_SUN_LEN(&un_addr));
 	if (res) {
 		char error_str[100];
 		strerror_r (errno, error_str, 100);
@@ -1063,12 +1106,35 @@ static void _corosync_ipc_init(void)
 #if !defined(COROSYNC_LINUX)
 	res = chmod (un_addr.sun_path, S_IRWXU|S_IRWXG|S_IRWXO);
 #endif
-	listen (server_fd, SERVER_BACKLOG);
+	listen (coro_server_fd, SERVER_BACKLOG);
 
 	/*
 	 * Setup connection dispatch routine
 	 */
-	api->poll_accept_add (server_fd);
+	api->poll_accept_add (coro_server_fd);
+}
+
+static void server_socket_withdraw(void)
+{
+	log_printf(LOGSYS_LEVEL_WARNING,
+		"Withdrawing socket for client connections.\n");
+
+	api->poll_dispatch_destroy(coro_server_fd, NULL);
+	shutdown(coro_server_fd, SHUT_RDWR);
+	close(coro_server_fd);
+	coro_server_fd = -1;
+}
+
+static void server_socket_check(void)
+{
+	int32_t num = num_avail_sockets();
+
+	if (coro_server_fd == -1 && num > 0) {
+		server_socket_publish();
+	}
+	else if (coro_server_fd != -1 && num <= 0) {
+		server_socket_withdraw();
+	}
 }
 
 void coroipcs_ipc_exit (void)
@@ -1447,6 +1513,9 @@ retry_accept:
 		strerror_r (errno, error_str, 100);
 		log_printf (LOGSYS_LEVEL_ERROR,
 			"Could not accept Library connection: %s\n", error_str);
+		if (errno == EMFILE || errno == ENFILE) {
+			server_socket_withdraw();
+		}
 		return (0); /* This is an error, but -1 would indicate disconnect from poll loop */
 	}
 
@@ -1476,6 +1545,7 @@ retry_accept:
 	if (res != 0) {
 		close (new_fd);
 	}
+	server_socket_check();
 
 	return (0);
 }
-- 
1.7.1



More information about the Openais mailing list