[Openais] [PATCH corosync] add a monitoring and recovery service

Angus Salkeld asalkeld at redhat.com
Tue Mar 30 22:24:51 PDT 2010


Hi

This patch adds two new services:
mr: monitoring service (uses statgrab)
wd: watchdog service

The monitoring service uses libstatgrab to get memory and load
stats (it can do a lot more).
1) It puts the current value into a key "current" alongside the max value.
2) It compares the current value to the max:
  If current > max it sets state = "failed"
  Else it sets state = "good"

The watchdog service only does anything if:
1) you have a /dev/watchdog
2) there are resources with recovery == watchdog

It then opens /dev/watchdog and starts a timer set to half the
watchdog's timeout.
Each time the timer fires, it tickles the watchdog only if no resource
with recovery == watchdog has /resources/*/*/state == failed.

As this is an early patch, the bit that writes state=[good|failed] is
commented out to make testing easier.

To test (on a VM where you don't mind the machine shutting down):
Add the following to your config:
service {
	name: corosync_mr
	ver: 0
}
service {
	name: corosync_wd
	ver: 0
}
resources {
  system {
    memory_used {
      state: unknown
      # max here is the percent of total mem used
      max: 80
      recovery: watchdog
    }
    load_15min {
      state: unknown
      # max here is the actual loadaverage
      max: 20
      recovery: watchdog
    }
  }
}

Then
$ modprobe softdog
$ /etc/init.d/corosync start

To view the stats
$ corosync-objctl resources.

To trigger a watchdog timeout
$ corosync-objctl -w resources.system.memory_used.state=failed

Regards
Angus

Signed-off-by: Angus Salkeld <asalkeld at redhat.com>
---
 configure.ac                |   30 +++-
 exec/Makefile.am            |    2 +-
 include/corosync/corodefs.h |    4 +-
 services/Makefile.am        |    5 +-
 services/mr.c               |  341 +++++++++++++++++++++++++++++++++++
 services/wd.c               |  420 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 791 insertions(+), 11 deletions(-)
 create mode 100644 services/mr.c
 create mode 100644 services/wd.c

diff --git a/configure.ac b/configure.ac
index 59f10fb..99b5666 100644
--- a/configure.ac
+++ b/configure.ac
@@ -182,19 +182,19 @@ CONFDB_SONAME="${SOMAJOR}.${CONFDB_SOMINOR}.${CONFDB_SOMICRO}"
 
 # local options
 AC_ARG_ENABLE([ansi],
-	[  --enable-ansi           : force to build with ANSI standards. ],
+	[  --enable-ansi                   : force to build with ANSI standards. ],
 	[ default="no" ])
 
 AC_ARG_ENABLE([fatal-warnings],
-	[  --enable-fatal-warnings : enable fatal warnings. ],
+	[  --enable-fatal-warnings         : enable fatal warnings. ],
 	[ default="no" ])
 
 AC_ARG_ENABLE([debug],
-	[ --enable-debug          : enable debug build. ],
+	[  --enable-debug                  : enable debug build. ],
 	[ default="no" ])
 
 AC_ARG_ENABLE([coverage],
-	[  --enable-coverage       : coverage analysis of the codebase. ],
+	[  --enable-coverage               : coverage analysis of the codebase. ],
 	[ default="no" ])
 
 AC_ARG_ENABLE([small-memory-footprint],
@@ -202,20 +202,26 @@ AC_ARG_ENABLE([small-memory-footprint],
 	[ default="no" ])
 
 AC_ARG_ENABLE([nss],
-	[  --enable-nss            : Network Security Services encryption. ],,
+	[  --enable-nss                    : Network Security Services encryption. ],,
 	[ enable_nss="yes" ])
 
 AC_ARG_ENABLE([testagents],
-	[  --enable-testagents            : Install Test Agents. ],,
+	[  --enable-testagents             : Install Test Agents. ],,
 	[ default="no" ])
 
 AC_ARG_ENABLE([rdma],
-	[  --enable-rdma           : Infiniband RDMA transport support ],,
+	[  --enable-rdma                   : Infiniband RDMA transport support ],,
 	[ enable_rdma="no" ])
 AM_CONDITIONAL(BUILD_RDMA, test x$enable_rdma = xyes)
 
+AC_ARG_ENABLE([statgrab],
+	[  --enable-statgrab               : statgrab resource monitoring ],,
+	[ default="no" ])
+AM_CONDITIONAL(HAVE_STATGRAB, test x$enable_statgrab = xyes)
+
+
 AC_ARG_ENABLE([augeas],
-	[  --enable-augeas           : Install the augeas lens for corosync.conf ],,
+	[  --enable-augeas                 : Install the augeas lens for corosync.conf ],,
 	[ enable_augeas="no" ])
 AM_CONDITIONAL(INSTALL_AUGEAS, test x$enable_augeas = xyes)
 
@@ -349,6 +355,14 @@ if test "x${enable_rdma}" = xyes; then
 	PACKAGE_FEATURES="$PACKAGE_FEATURES rdma"
 fi
 
+if test "x${enable_statgrab}" = xyes; then
+	AC_CHECK_LIB([statgrab], [sg_get_mem_stats])
+	AC_CHECK_HEADERS([statgrab.h])
+	statgrab_LIBS="-lstatgrab"
+	AC_SUBST([statgrab_LIBS])
+	PACKAGE_FEATURES="$PACKAGE_FEATURES statgrab"
+fi
+
 if test "x${enable_augeas}" = xyes; then
 	PACKAGE_FEATURES="$PACKAGE_FEATURES augeas"
 fi
diff --git a/exec/Makefile.am b/exec/Makefile.am
index f367f29..7c207c3 100644
--- a/exec/Makefile.am
+++ b/exec/Makefile.am
@@ -59,7 +59,7 @@ libcoroipcs_a_SOURCES	= $(COROIPCS_SRC)
 corosync_SOURCES 	= main.c util.c sync.c apidef.c service.c \
 			  timer.c totemconfig.c mainconfig.c quorum.c schedwrk.c \
 			  ../lcr/lcr_ifact.c evil.c syncv2.c
-corosync_LDADD	  	= -ltotem_pg -llogsys -lcoroipcs
+corosync_LDADD	  	= -ltotem_pg -llogsys -lcoroipcs $(statgrab_LIBS)
 corosync_DEPENDENCIES	= libtotem_pg.so.$(SONAME) liblogsys.so.$(SONAME) libcoroipcs.so.$(SONAME)
 corosync_LDFLAGS	= $(OS_DYFLAGS) -L./
 
diff --git a/include/corosync/corodefs.h b/include/corosync/corodefs.h
index 57923e2..8a81327 100644
--- a/include/corosync/corodefs.h
+++ b/include/corosync/corodefs.h
@@ -59,7 +59,9 @@ enum corosync_service_types {
 	NTF_SERVICE = 16,
 	AMF_V2_SERVICE = 17,
 	TST_SV1_SERVICE = 18,
-	TST_SV2_SERVICE = 19
+	TST_SV2_SERVICE = 19,
+	MR_SERVICE = 20,
+	WD_SERVICE = 21
 };
 
 #ifdef HAVE_SMALL_MEMORY_FOOTPRINT
diff --git a/services/Makefile.am b/services/Makefile.am
index bb63336..7c17409 100644
--- a/services/Makefile.am
+++ b/services/Makefile.am
@@ -37,7 +37,10 @@ INCLUDES		= -I$(top_builddir)/include -I$(top_srcdir)/include \
 			  -I$(top_builddir)/include/corosync \
 			  -I$(top_srcdir)/include/corosync
 
-SERVICE_LCRSO		= evs cfg cpg confdb pload
+SERVICE_LCRSO		= evs cfg cpg confdb pload wd
+if HAVE_STATGRAB
+SERVICE_LCRSO		+= mr
+endif
 
 QUORUM_LCRSO		= votequorum testquorum
 
diff --git a/services/mr.c b/services/mr.c
new file mode 100644
index 0000000..39cbddf
--- /dev/null
+++ b/services/mr.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc.
+ *
+ * All rights reserved.
+ *
+ * Author: Angus Salkeld <asalkeld at redhat.com>
+ *
+ * This software licensed under BSD license, the text of which follows:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the MontaVista Software, Inc. nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <config.h>
+
+#include <unistd.h>
+#include <statgrab.h>
+
+#include <corosync/corotypes.h>
+#include <corosync/corodefs.h>
+#include <corosync/lcr/lcr_comp.h>
+#include <corosync/engine/coroapi.h>
+#include <corosync/list.h>
+#include <corosync/engine/logsys.h>
+
+
+LOGSYS_DECLARE_SUBSYS ("MR");
+
+/*
+ * Service Interfaces required by service_message_handler struct
+ */
+static int mr_exec_init_fn (
+	struct corosync_api_v1 *corosync_api);
+
+static void mr_confchg_fn (
+	enum totem_configuration_type configuration_type,
+	const unsigned int *member_list, size_t member_list_entries,
+	const unsigned int *left_list, size_t left_list_entries,
+	const unsigned int *joined_list, size_t joined_list_entries,
+	const struct memb_ring_id *ring_id);
+
+
+static int mr_lib_init_fn (void *conn);
+
+static int mr_lib_exit_fn (void *conn);
+
+static struct corosync_api_v1 *api;
+
+static hdb_handle_t memory_used_obj;
+static hdb_handle_t load_15min_obj;
+static pthread_t mr_poll_thread;
+
+struct corosync_service_engine mr_service_engine = {
+	.name			= "corosync monitoring and recovery service",
+	.id			= MR_SERVICE,
+	.priority		= 1,
+	.private_data_size	= 0,
+	.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED,
+	.lib_init_fn		= mr_lib_init_fn,
+	.lib_exit_fn		= mr_lib_exit_fn,
+	.lib_engine		= NULL,
+	.lib_engine_count	= 0,
+	.exec_engine		= NULL,
+	.exec_engine_count	= 0,
+	.confchg_fn		= mr_confchg_fn,
+	.exec_init_fn		= mr_exec_init_fn,
+	.exec_dump_fn		= NULL,
+	.sync_mode		= CS_SYNC_V2
+};
+
+static DECLARE_LIST_INIT (confchg_notify);
+
+/*
+ * Dynamic loading descriptor
+ */
+
+static struct corosync_service_engine *mr_get_service_engine_ver0 (void);
+
+static struct corosync_service_engine_iface_ver0 mr_service_engine_iface = {
+	.corosync_get_service_engine_ver0	= mr_get_service_engine_ver0
+};
+
+static struct lcr_iface corosync_mr_ver0[1] = {
+	{
+		.name			= "corosync_mr",
+		.version		= 0,
+		.versions_replace	= 0,
+		.versions_replace_count = 0,
+		.dependencies		= 0,
+		.dependency_count	= 0,
+		.constructor		= NULL,
+		.destructor		= NULL,
+		.interfaces		= NULL,
+	}
+};
+
+static struct lcr_comp mr_comp_ver0 = {
+	.iface_count	= 1,
+	.ifaces		= corosync_mr_ver0
+};
+
+static struct corosync_service_engine *mr_get_service_engine_ver0 (void)
+{
+	return (&mr_service_engine);
+}
+
+#ifdef COROSYNC_SOLARIS
+void corosync_lcr_component_register (void);
+
+void corosync_lcr_component_register (void) {
+#else
+__attribute__ ((constructor)) static void corosync_lcr_component_register (void) {
+#endif
+	lcr_interfaces_set (&corosync_mr_ver0[0], &mr_service_engine_iface);
+
+	lcr_component_register (&mr_comp_ver0);
+}
+
+/*
+ * Store new_value under the resource's "current" key and compare it with
+ * the configured "max" string: as a double for OBJDB_VALUETYPE_DOUBLE,
+ * otherwise as a uint32_t. Nothing is compared if "max" is not set.
+ */
+static void update_resource (hdb_handle_t resource,
+	void *new_value, size_t new_value_size, objdb_value_types_t type)
+{
+	char *max_str;
+	size_t max_str_len;
+	objdb_value_types_t max_type;
+	const char *status = "good";
+
+	api->object_key_replace (resource,
+		"current", strlen("current"),
+		new_value, new_value_size);
+
+	if (api->object_key_get_typed (resource, "max", (void**)&max_str, &max_str_len, &max_type) != 0) {
+		return;
+	}
+	switch (type) {
+	case OBJDB_VALUETYPE_DOUBLE:
+		if (*((double*)new_value) > strtod (max_str, NULL))
+			status = "failed";
+		break;
+
+	default:
+		/* widen both sides so a negative max is not converted to unsigned */
+		if ((long long)*((uint32_t*)new_value) > (long long)strtol (max_str, NULL, 0))
+			status = "failed";
+		break;
+	}
+	/*
+	 * Writing the state back is disabled in this early patch so that the
+	 * state can be set by hand while testing. To enable it:
+	 *
+	 *	api->object_key_replace (resource,
+	 *		"state", strlen("state"),
+	 *		status, strlen(status));
+	 */
+	(void)status;
+}
+
+/*
+ * Polling thread: every 30 seconds publish the percentage of memory plus
+ * swap in use and the 15 minute load average. Never returns.
+ */
+static void *mr_thread_handler (void * unused)
+{
+	sg_mem_stats *mem_stats;
+	sg_swap_stats *swap_stats;
+	long long total, freemem;
+	uint32_t new_value;
+	sg_load_stats *load_stats;
+
+	sg_init();
+	while (1) {
+		mem_stats = sg_get_mem_stats();
+		swap_stats = sg_get_swap_stats();
+		if (mem_stats == NULL || swap_stats == NULL) {
+			log_printf (LOGSYS_LEVEL_ERROR, "Unable to get VM stats: %s\n",
+				sg_str_error(sg_get_error()));
+		}
+		else {
+			total = mem_stats->total + swap_stats->total;
+			freemem = mem_stats->free + swap_stats->free;
+			/* a zero total would otherwise divide by zero */
+			new_value = (total > 0) ? ((total - freemem) * 100) / total : 0;
+			update_resource (memory_used_obj, &new_value, sizeof(new_value), OBJDB_VALUETYPE_UINT32);
+		}
+		load_stats = sg_get_load_stats ();
+		if (load_stats == NULL) {
+			log_printf (LOGSYS_LEVEL_ERROR, "Unable to get load stats: %s\n",
+				sg_str_error(sg_get_error()));
+		}
+		else {
+			update_resource (load_15min_obj,
+				&load_stats->min15, sizeof (load_stats->min15), OBJDB_VALUETYPE_DOUBLE);
+		}
+		sleep(30);
+	}
+	return NULL;
+}
+
+/* Hand back the first child of the parent named object_name, creating it
+ * when none exists; returns 0 if found, else object_create's result. */
+static int object_find_or_create (
+	hdb_handle_t parent_object_handle,
+	hdb_handle_t *object_handle,
+	const void *object_name,
+	size_t object_name_len)
+{
+	hdb_handle_t obj_finder;
+	hdb_handle_t obj;
+	int ret = -1;
+
+	api->object_find_create (
+		parent_object_handle,
+		object_name,
+		object_name_len,
+		&obj_finder);
+	if (api->object_find_next (obj_finder, &obj) == 0) {
+		/* found it */
+		*object_handle = obj;
+		ret = 0;
+	}
+	else {
+		ret = api->object_create (parent_object_handle,
+			object_handle,
+			object_name, object_name_len);
+	}
+	api->object_find_destroy (obj_finder);
+	return ret;
+}
+
+
+/*
+ * Create (or reuse) /resources/system/{memory_used,load_15min}, give each
+ * a zero-valued "current" key and start the polling thread.
+ * Returns 0 on success, -1 if an object or the thread cannot be created.
+ */
+static int mr_exec_init_fn (
+	struct corosync_api_v1 *corosync_api)
+{
+	hdb_handle_t resources_obj;
+	hdb_handle_t system_obj;
+	uint32_t zero_32 = 0;
+	double zero_double = 0;
+
+#ifdef COROSYNC_SOLARIS
+	logsys_subsys_init();
+#endif
+	log_printf (LOGSYS_LEVEL_INFO, "%s\n", __func__);
+	api = corosync_api;
+
+	if (object_find_or_create (OBJECT_PARENT_HANDLE, &resources_obj,
+			"resources", strlen ("resources")) != 0 ||
+	    object_find_or_create (resources_obj, &system_obj,
+			"system", strlen ("system")) != 0 ||
+	    object_find_or_create (system_obj, &memory_used_obj,
+			"memory_used", strlen ("memory_used")) != 0 ||
+	    object_find_or_create (system_obj, &load_15min_obj,
+			"load_15min", strlen ("load_15min")) != 0) {
+		log_printf (LOGSYS_LEVEL_ERROR, "Unable to create the resource objects\n");
+		return -1;
+	}
+
+	api->object_key_create_typed (memory_used_obj,
+		"current", &zero_32,
+		sizeof (zero_32), OBJDB_VALUETYPE_UINT32);
+	api->object_key_create_typed (load_15min_obj,
+		"current", &zero_double,
+		sizeof (zero_double), OBJDB_VALUETYPE_DOUBLE);
+
+/*
+/resources/
+	system/
+		memory_used/
+			max = <X>%
+			current = ()
+		load_15min/
+			max = <X>%
+		used_inodes/
+			max = <X>%
+		used_blocks/
+			max = <X>%
+		net_in_errors/
+			max = <X>%
+		net_out_errors/
+			max = <X>%
+
+	processes/
+		<name|pid>
+
+*/
+	if (pthread_create (&mr_poll_thread, NULL, mr_thread_handler, NULL) != 0) {
+		log_printf (LOGSYS_LEVEL_ERROR, "Unable to start the polling thread\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static void mr_confchg_fn (
+	enum totem_configuration_type configuration_type,
+	const unsigned int *member_list, size_t member_list_entries,
+	const unsigned int *left_list, size_t left_list_entries,
+	const unsigned int *joined_list, size_t joined_list_entries,
+	const struct memb_ring_id *ring_id)
+{
+}
+
+static int mr_lib_init_fn (void *conn)
+{
+	return (0);
+}
+
+static int mr_lib_exit_fn (void *conn)
+{
+	return (0);
+}
+
diff --git a/services/wd.c b/services/wd.c
new file mode 100644
index 0000000..3fb1db0
--- /dev/null
+++ b/services/wd.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc.
+ *
+ * All rights reserved.
+ *
+ * Author: Angus Salkeld <asalkeld at redhat.com>
+ *
+ * This software licensed under BSD license, the text of which follows:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the MontaVista Software, Inc. nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <config.h>
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/watchdog.h>
+
+#include <corosync/corotypes.h>
+#include <corosync/corodefs.h>
+#include <corosync/lcr/lcr_comp.h>
+#include <corosync/engine/coroapi.h>
+#include <corosync/list.h>
+#include <corosync/engine/logsys.h>
+
+
+typedef enum {
+	WD_RESOURCE_GOOD,
+	WD_RESOURCE_FAILED,
+	WD_RESOURCE_UNKNOWN,
+	WD_RESOURCE_NOT_MONITORED
+} wd_resource_state_t;
+
+
+LOGSYS_DECLARE_SUBSYS ("WD");
+
+/*
+ * Service Interfaces required by service_message_handler struct
+ */
+static int wd_exec_init_fn (
+	struct corosync_api_v1 *corosync_api);
+static int wd_exec_exit_fn (void);
+
+static void wd_confchg_fn (
+	enum totem_configuration_type configuration_type,
+	const unsigned int *member_list, size_t member_list_entries,
+	const unsigned int *left_list, size_t left_list_entries,
+	const unsigned int *joined_list, size_t joined_list_entries,
+	const struct memb_ring_id *ring_id);
+
+
+static int wd_lib_init_fn (void *conn);
+static int wd_lib_exit_fn (void *conn);
+
+static struct corosync_api_v1 *api;
+static uint32_t wd_timeout = 5;
+static int dog;
+static corosync_timer_handle_t wd_timer;
+
+struct corosync_service_engine wd_service_engine = {
+	.name			= "corosync watchdog fencing service",
+	.id			= WD_SERVICE,
+	.priority		= 1,
+	.private_data_size	= 0,
+	.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED,
+	.lib_init_fn		= wd_lib_init_fn,
+	.lib_exit_fn		= wd_lib_exit_fn,
+	.lib_engine		= NULL,
+	.lib_engine_count	= 0,
+	.exec_engine		= NULL,
+	.exec_engine_count	= 0,
+	.confchg_fn		= wd_confchg_fn,
+	.exec_init_fn		= wd_exec_init_fn,
+	.exec_exit_fn		= wd_exec_exit_fn,
+	.exec_dump_fn		= NULL,
+	.sync_mode		= CS_SYNC_V2
+};
+
+static DECLARE_LIST_INIT (confchg_notify);
+
+/*
+ * Dynamic loading descriptor
+ */
+
+static struct corosync_service_engine *wd_get_service_engine_ver0 (void);
+
+static struct corosync_service_engine_iface_ver0 wd_service_engine_iface = {
+	.corosync_get_service_engine_ver0	= wd_get_service_engine_ver0
+};
+
+static struct lcr_iface corosync_wd_ver0[1] = {
+	{
+		.name			= "corosync_wd",
+		.version		= 0,
+		.versions_replace	= 0,
+		.versions_replace_count = 0,
+		.dependencies		= 0,
+		.dependency_count	= 0,
+		.constructor		= NULL,
+		.destructor		= NULL,
+		.interfaces		= NULL,
+	}
+};
+
+static struct lcr_comp wd_comp_ver0 = {
+	.iface_count	= 1,
+	.ifaces		= corosync_wd_ver0
+};
+
+static struct corosync_service_engine *wd_get_service_engine_ver0 (void)
+{
+	return (&wd_service_engine);
+}
+
+#ifdef COROSYNC_SOLARIS
+void corosync_lcr_component_register (void);
+
+void corosync_lcr_component_register (void) {
+#else
+__attribute__ ((constructor)) static void corosync_lcr_component_register (void) {
+#endif
+	lcr_interfaces_set (&corosync_wd_ver0[0], &wd_service_engine_iface);
+
+	lcr_component_register (&wd_comp_ver0);
+}
+
+/*
+ * Classify a resource for the watchdog. A resource is monitored only if
+ * it has both a "recovery" key equal to "watchdog" and a "state" key; its
+ * state then maps "failed"/"good" to FAILED/GOOD and anything else to
+ * UNKNOWN. All other resources are WD_RESOURCE_NOT_MONITORED.
+ */
+static wd_resource_state_t resource_state_get (hdb_handle_t resource)
+{
+	objdb_value_types_t value_type;
+	char *recovery_str;
+	size_t recovery_len;
+	char *state_str;
+	size_t state_len;
+
+	/* no "recovery" key: not monitored */
+	if (api->object_key_get_typed (resource, "recovery",
+			(void*)&recovery_str, &recovery_len, &value_type) != 0) {
+		return WD_RESOURCE_NOT_MONITORED;
+	}
+	/* no "state" key: not monitored either */
+	if (api->object_key_get_typed (resource, "state",
+			(void*)&state_str, &state_len, &value_type) != 0) {
+		return WD_RESOURCE_NOT_MONITORED;
+	}
+
+	if (strcmp (recovery_str, "watchdog") != 0) {
+		return WD_RESOURCE_NOT_MONITORED;
+	}
+
+	if (strcmp (state_str, "failed") == 0) {
+		return WD_RESOURCE_FAILED;
+	}
+	if (strcmp (state_str, "good") == 0) {
+		return WD_RESOURCE_GOOD;
+	}
+	return WD_RESOURCE_UNKNOWN;
+}
+
+
+/*
+ * Timer callback: walk /resources/<type>/<resource> and tickle the
+ * watchdog only if no resource with recovery "watchdog" has failed, then
+ * re-arm the timer for wd_timeout seconds. If there is no "resources"
+ * object it returns without tickling or re-arming. arg is unused.
+ */
+static void wd_tickle_fn (void* arg)
+{
+	hdb_handle_t obj_finder;
+	hdb_handle_t obj_finder2;
+	hdb_handle_t resources;
+	hdb_handle_t resource_type;
+	hdb_handle_t resource;
+	int res;
+	char object_name[128];
+	size_t object_name_len;
+	char object_name2[128];
+	size_t object_name2_len;
+	int all_ok = 1;
+	wd_resource_state_t state;
+
+	api->object_find_create (
+		OBJECT_PARENT_HANDLE,
+		"resources", strlen ("resources"),
+		&obj_finder);
+
+	res = api->object_find_next (obj_finder, &resources);
+	api->object_find_destroy (obj_finder);
+	if (res != 0) {
+		return;
+	}
+
+	/* this will be the system or processes level
+	 */
+	api->object_find_create (resources, NULL, 0, &obj_finder);
+	while (api->object_find_next (obj_finder,
+			&resource_type) == 0) {
+		/* object_name_get reports a length, so print the names with %.*s
+		 * instead of relying on a terminating NUL */
+		object_name_len = 0;
+		api->object_name_get (resource_type, object_name, &object_name_len);
+
+		api->object_find_create (resource_type, NULL, 0, &obj_finder2);
+		while (api->object_find_next (obj_finder2,
+				&resource) == 0) {
+
+			object_name2_len = 0;
+			api->object_name_get (resource, object_name2, &object_name2_len);
+
+			state = resource_state_get (resource);
+			if (state == WD_RESOURCE_FAILED) {
+				all_ok = 0;
+				log_printf (LOGSYS_LEVEL_CRIT,
+					"/resources/%.*s/%.*s failed!",
+					(int)object_name_len, object_name,
+					(int)object_name2_len, object_name2);
+			}
+			else if (state == WD_RESOURCE_GOOD) {
+				log_printf (LOGSYS_LEVEL_INFO,
+					"/resources/%.*s/%.*s good.",
+					(int)object_name_len, object_name,
+					(int)object_name2_len, object_name2);
+			}
+			else {
+				log_printf (LOGSYS_LEVEL_INFO,
+					"/resources/%.*s/%.*s not monitored.",
+					(int)object_name_len, object_name,
+					(int)object_name2_len, object_name2);
+			}
+		}
+		api->object_find_destroy (obj_finder2);
+	}
+	api->object_find_destroy (obj_finder);
+
+	if (all_ok) {
+		/* tickle */
+		ioctl(dog, WDIOC_KEEPALIVE, &all_ok);
+
+		log_printf (LOGSYS_LEVEL_INFO,
+			"all watchdog'ed resources are good.");
+	}
+	else {
+		log_printf (LOGSYS_LEVEL_ALERT,
+			"all watchdog'ed resources are NOT good, NOT tickling the watchdog!");
+	}
+
+	api->timer_add_duration((unsigned long long)wd_timeout*1000000000, NULL,
+				wd_tickle_fn, &wd_timer);
+}
+
+/*
+ * Count the /resources/<type>/<resource> objects that the watchdog would
+ * monitor, i.e. those resource_state_get() does not report as
+ * WD_RESOURCE_NOT_MONITORED. Returns 0 if there is no "resources" object.
+ */
+static int num_resources_need_watchdog(void)
+{
+	hdb_handle_t obj_finder;
+	hdb_handle_t obj_finder2;
+	hdb_handle_t resources;
+	hdb_handle_t resource_type;
+	hdb_handle_t resource;
+	int res;
+	int number = 0;
+
+	api->object_find_create (
+		OBJECT_PARENT_HANDLE,
+		"resources", strlen ("resources"),
+		&obj_finder);
+
+	res = api->object_find_next (obj_finder, &resources);
+	api->object_find_destroy (obj_finder);
+	if (res != 0) {
+		return number;
+	}
+
+	/* this will be the system or processes level
+	 */
+	api->object_find_create (
+		resources,
+		NULL, 0,
+		&obj_finder);
+	while (api->object_find_next (obj_finder,
+			&resource_type) == 0) {
+
+		api->object_find_create (resource_type, NULL, 0, &obj_finder2);
+
+		while (api->object_find_next (obj_finder2,
+				&resource) == 0) {
+			if (resource_state_get (resource) != WD_RESOURCE_NOT_MONITORED) {
+				number++;
+			}
+		}
+		api->object_find_destroy (obj_finder2);
+	}
+	api->object_find_destroy (obj_finder);
+
+	return number;
+}
+
+/*
+ * Open /dev/watchdog, log its identity, read its timeout and enable it.
+ * On success wd_timeout holds half the device timeout (at least one
+ * second) and 0 is returned; -1 is returned if the device is unusable.
+ */
+static int setup_watchdog(void)
+{
+	struct watchdog_info ident;
+	int timeout = wd_timeout;
+	int options = WDIOS_ENABLECARD;
+
+	if (access ("/dev/watchdog", W_OK) != 0) {
+		log_printf (LOGSYS_LEVEL_WARNING, "No Watchdog, try modprobe <a watchdog>");
+		return -1;
+	}
+
+	/* here goes, lets hope they have "Magic Close" */
+	dog = open("/dev/watchdog", O_WRONLY);
+	if (dog == -1) {
+		log_printf (LOGSYS_LEVEL_WARNING, "Watchdog exists but couldn't be opened.");
+		return -1;
+	}
+
+	log_printf (LOGSYS_LEVEL_INFO, "Watchdog is now been tickled by corosync.");
+	if (ioctl(dog, WDIOC_GETSUPPORT, &ident) == 0) {
+		log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity);
+	}
+	/* TODO set the timeout from a config option (needs WDIOF_SETTIMEOUT) */
+
+	/* WDIOC_GETTIMEOUT takes an int *; the default is kept if it fails */
+	ioctl(dog, WDIOC_GETTIMEOUT, &timeout);
+	log_printf (LOGSYS_LEVEL_DEBUG, "The timeout is %d seconds\n", timeout);
+	/* tickle twice per timeout, but never with a zero-length timer */
+	wd_timeout = (timeout >= 2) ? timeout / 2 : 1;
+	/* WDIOC_SETOPTIONS takes a pointer to the option flags, not a value */
+	ioctl(dog, WDIOC_SETOPTIONS, &options);
+	return 0;
+}
+
+/* Service init: start the tickle timer only when some resource asks for
+ * watchdog recovery and setup_watchdog() succeeds; otherwise return -1
+ * without starting the timer. */
+static int wd_exec_init_fn (
+	struct corosync_api_v1 *corosync_api)
+{
+#ifdef COROSYNC_SOLARIS
+	logsys_subsys_init();
+#endif
+	log_printf (LOGSYS_LEVEL_INFO, "%s\n", __func__);
+	api = corosync_api;
+
+	if (num_resources_need_watchdog() == 0)
+		return -1;
+
+	/* this will setup wd_timeout */
+	if (setup_watchdog() != 0)
+		return -1;
+	api->timer_add_duration((unsigned long long)wd_timeout*1000000000, NULL,
+				wd_tickle_fn, &wd_timer);
+	return 0;
+}
+
+static int wd_exec_exit_fn (void)
+{
+	/* Magic close: write 'V', then close the device so it can disarm. */
+	if (dog > 0 && write (dog, "V", 1) == 1) {
+		log_printf (LOGSYS_LEVEL_INFO, "%s: magic close.\n", __func__);
+		close (dog);
+	}
+	return 0;
+}
+
+static void wd_confchg_fn (
+	enum totem_configuration_type configuration_type,
+	const unsigned int *member_list, size_t member_list_entries,
+	const unsigned int *left_list, size_t left_list_entries,
+	const unsigned int *joined_list, size_t joined_list_entries,
+	const struct memb_ring_id *ring_id)
+{
+}
+
+static int wd_lib_init_fn (void *conn)
+{
+	return (0);
+}
+
+static int wd_lib_exit_fn (void *conn)
+{
+	return (0);
+}
+
-- 
1.6.6.1




More information about the Openais mailing list