[Openais] [PATCH openais whitetank] IPC: gracefully handle running out of file descriptors. (version
Steven Dake
sdake at redhat.com
Mon Aug 9 10:03:14 PDT 2010
On 08/08/2010 10:40 PM, Angus Salkeld wrote:
> On Sun, Aug 08, 2010 at 10:17:43PM -0700, Steven Dake wrote:
>> Is the model that when the server has too many sockets in use, the
>> library returns TRY_AGAIN?
> Hi Steve
>
> No, if the server runs out of fds then we shut down the listening
> socket. The library returns LIB error, I believe.
>
> Then when we have more fds we set up the listening socket again.
>
> Is there any point in trying again in this situation? I would say this
> is not a "normal" error and probably shows a machine set up incorrectly.
> If processes start and quietly keep trying again it might not help anyone.
>
> -Angus
>
The check is done at IPC connection, right? In that case, my guess is it
would appear the server is not operating and return "TRY_AGAIN".
In this case, the library should return CS_ERR_NO_RESOURCES, but I don't
think that is possible.
I suppose for future versions of corosync we need to think a little more
clearly about the possible error conditions that every API can return:
1. server out of resources
2. client out of resources
3. server too busy
4. server rejected security
5. server not operational
>>
>> Regards
>> -steve
>>
>> On 08/08/2010 08:01 PM, Angus Salkeld wrote:
>>> Whenever we accept a new connection or close an
>>> existing one, check the number of available file
>>> descriptors and either publish or withdraw the
>>> IPC listening socket.
>>>
>>> Signed-off-by: Angus Salkeld<asalkeld at redhat.com>
>>> ---
>>> exec/ipc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>>> 1 files changed, 86 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/exec/ipc.c b/exec/ipc.c
>>> index 5337d25..bf3102c 100644
>>> --- a/exec/ipc.c
>>> +++ b/exec/ipc.c
>>> @@ -99,6 +99,7 @@
>>> #define MSG_SEND_UNLOCKED 1
>>>
>>> static unsigned int g_gid_valid = 0;
>>> +static int32_t libais_server_fd = -1;
>>>
>>> static void (*ipc_serialize_lock_fn) (void);
>>>
>>> @@ -161,6 +162,15 @@ static int priv_change (struct conn_info *conn_info);
>>>
>>> static void ipc_disconnect (struct conn_info *conn_info);
>>>
>>> +static void server_socket_publish(void);
>>> +
>>> +static void server_socket_withdraw(void);
>>> +
>>> +static void server_socket_check(void);
>>> +
>>> +static int poll_handler_accept (poll_handle handle, int fd,
>>> + int revent, void *data);
>>> +
>>> static int ipc_thread_active (void *conn)
>>> {
>>> struct conn_info *conn_info = (struct conn_info *)conn;
>>> @@ -211,6 +221,7 @@ static inline int conn_info_destroy (struct conn_info *conn_info)
>>> conn_info->state == CONN_STATE_DISCONNECT_INACTIVE) {
>>> list_del (&conn_info->list);
>>> close (conn_info->fd);
>>> + server_socket_check();
>>> free (conn_info);
>>> return (-1);
>>> }
>>> @@ -257,6 +268,7 @@ static inline int conn_info_destroy (struct conn_info *conn_info)
>>> free (conn_info->private_data);
>>> }
>>> close (conn_info->fd);
>>> + server_socket_check();
>>> free (conn_info);
>>> ipc_serialize_unlock_fn();
>>> return (-1);
>>> @@ -773,7 +785,12 @@ retry_accept:
>>> }
>>>
>>> if (new_fd == -1) {
>>> - log_printf (LOG_LEVEL_ERROR, "ERROR: Could not accept Library connection: %s\n", strerror (errno));
>>> + log_printf (LOG_LEVEL_ERROR,
>>> + "ERROR: Could not accept Library connection: %s\n",
>>> + strerror (errno));
>>> + if (errno == EMFILE || errno == ENFILE) {
>>> + server_socket_withdraw();
>>> + }
>>> return (0); /* This is an error, but -1 would indicate disconnect from poll loop */
>>> }
>>>
>>> @@ -802,6 +819,7 @@ retry_accept:
>>> if (res != 0) {
>>> close (new_fd);
>>> }
>>> + server_socket_check();
>>>
>>> return (0);
>>> }
>>> @@ -835,14 +853,23 @@ void openais_ipc_init (
>>> void (*serialize_lock_fn) (void),
>>> void (*serialize_unlock_fn) (void))
>>> {
>>> - int libais_server_fd;
>>> - struct sockaddr_un un_addr;
>>> - int res;
>>> -
>>> ipc_serialize_lock_fn = serialize_lock_fn;
>>>
>>> ipc_serialize_unlock_fn = serialize_unlock_fn;
>>>
>>> + server_socket_publish();
>>> +
>>> + g_gid_valid = gid_valid;
>>> +}
>>> +
>>> +static void server_socket_publish(void)
>>> +{
>>> + int32_t res = 0;
>>> + struct sockaddr_un un_addr;
>>> +
>>> + log_printf(LOG_LEVEL_WARNING,
>>> + "Publishing socket for client connections.\n");
>>> +
>>> /*
>>> * Create socket for libais clients, name socket, listen for connections
>>> */
>>> @@ -885,8 +912,61 @@ void openais_ipc_init (
>>> */
>>> poll_dispatch_add (aisexec_poll_handle, libais_server_fd,
>>> POLLIN|POLLNVAL, 0, poll_handler_accept);
>>> +}
>>>
>>> - g_gid_valid = gid_valid;
>>> +static void server_socket_withdraw(void)
>>> +{
>>> + log_printf(LOG_LEVEL_WARNING,
>>> + "Withdrawing socket for client connections.\n");
>>> +
>>> + poll_dispatch_delete(aisexec_poll_handle, libais_server_fd);
>>> + shutdown(libais_server_fd, SHUT_RDWR);
>>> + close(libais_server_fd);
>>> + libais_server_fd = -1;
>>> +}
>>> +
>>> +/*
>>> + * The actual used sockets is 12 but allowing a larger number
>>> + * for safety.
>>> + */
>>> +#define COROIPC_NUM_RESERVED_SOCKETS 25
>>> +
>>> +static int32_t num_avail_sockets(void)
>>> +{
>>> + struct rlimit lim;
>>> + int32_t open_socks = 0;
>>> + int32_t res;
>>> + struct list_head *list;
>>> +
>>> + if (getrlimit(RLIMIT_NOFILE,&lim) == -1) {
>>> + char error_str[100];
>>> + strerror_r(errno, error_str, 100);
>>> + log_printf(LOG_LEVEL_ERROR,
>>> + "getrlimit: %s\n", error_str);
>>> + return -1;
>>> + }
>>> +
>>> + for (list = conn_info_list_head.next; list !=&conn_info_list_head;
>>> + list = list->next) {
>>> + open_socks++;
>>> + }
>>> + res = lim.rlim_cur - (open_socks + COROIPC_NUM_RESERVED_SOCKETS);
>>> + log_printf(LOG_LEVEL_DEBUG,
>>> + "(lim.rlim_cur:%lu - (open_socks:%d + reserved:%d) == %d\n",
>>> + lim.rlim_cur, open_socks, COROIPC_NUM_RESERVED_SOCKETS, res);
>>> + return res;
>>> +}
>>> +
>>> +static void server_socket_check(void)
>>> +{
>>> + int32_t num = num_avail_sockets();
>>> +
>>> + if (libais_server_fd == -1&& num> 0) {
>>> + server_socket_publish();
>>> + }
>>> + else if (libais_server_fd != -1&& num<= 0) {
>>> + server_socket_withdraw();
>>> + }
>>> }
>>>
>>> void openais_ipc_exit (void)
> _______________________________________________
> Openais mailing list
> Openais at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/openais
More information about the Openais
mailing list