[Openais] [PATCH corosync] add a monitoring and recovery service
Angus Salkeld
asalkeld at redhat.com
Tue Mar 30 22:24:51 PDT 2010
Hi
This patch adds two new services:
mr: monitoring service (uses statgrab)
wd: watchdog service
The monitoring service uses libstatgrab to get memory and load
stats (it can do a lot more).
1) It puts the current value into a key "current" alongside the max value.
2) It compares the current value to the max:
If current > max it sets state = "failed"
Else it sets state = "good"
The watchdog service only does anything if:
1) you have a /dev/watchdog
2) there are resources with recovery == watchdog
It then opens /dev/watchdog and starts a timer 1/2 the timeout of
the watchdog.
At each timeout it only tickles the watchdog if /resources/*/state!=failed
As this is an early patch, the bit that writes state=[good|failed] is
commented out to make testing easier.
To test (on a VM where you don't mind the machine shutting down):
Add the following to your config:
service {
name: corosync_mr
ver: 0
}
service {
name: corosync_wd
ver: 0
}
resources {
system {
memory_used {
state: unknown
# max here is the percent of total mem used
max: 80
recovery: watchdog
}
load_15min {
state: unknown
# max here is the actual loadaverage
max: 20
recovery: watchdog
}
}
}
Then
$ modprobe softdog
$ /etc/init.d/corosync start
To view the stats
$ corosync-objctl resources.
To trigger a watchdog timeout
$ corosync-objctl -w resources.system.memory_used.state=failed
Regards
Angus
Signed-off-by: Angus Salkeld <asalkeld at redhat.com>
---
configure.ac | 30 +++-
exec/Makefile.am | 2 +-
include/corosync/corodefs.h | 4 +-
services/Makefile.am | 5 +-
services/mr.c | 341 +++++++++++++++++++++++++++++++++++
services/wd.c | 420 +++++++++++++++++++++++++++++++++++++++++++
6 files changed, 791 insertions(+), 11 deletions(-)
create mode 100644 services/mr.c
create mode 100644 services/wd.c
diff --git a/configure.ac b/configure.ac
index 59f10fb..99b5666 100644
--- a/configure.ac
+++ b/configure.ac
@@ -182,19 +182,19 @@ CONFDB_SONAME="${SOMAJOR}.${CONFDB_SOMINOR}.${CONFDB_SOMICRO}"
# local options
AC_ARG_ENABLE([ansi],
- [ --enable-ansi : force to build with ANSI standards. ],
+ [ --enable-ansi : force to build with ANSI standards. ],
[ default="no" ])
AC_ARG_ENABLE([fatal-warnings],
- [ --enable-fatal-warnings : enable fatal warnings. ],
+ [ --enable-fatal-warnings : enable fatal warnings. ],
[ default="no" ])
AC_ARG_ENABLE([debug],
- [ --enable-debug : enable debug build. ],
+ [ --enable-debug : enable debug build. ],
[ default="no" ])
AC_ARG_ENABLE([coverage],
- [ --enable-coverage : coverage analysis of the codebase. ],
+ [ --enable-coverage : coverage analysis of the codebase. ],
[ default="no" ])
AC_ARG_ENABLE([small-memory-footprint],
@@ -202,20 +202,26 @@ AC_ARG_ENABLE([small-memory-footprint],
[ default="no" ])
AC_ARG_ENABLE([nss],
- [ --enable-nss : Network Security Services encryption. ],,
+ [ --enable-nss : Network Security Services encryption. ],,
[ enable_nss="yes" ])
AC_ARG_ENABLE([testagents],
- [ --enable-testagents : Install Test Agents. ],,
+ [ --enable-testagents : Install Test Agents. ],,
[ default="no" ])
AC_ARG_ENABLE([rdma],
- [ --enable-rdma : Infiniband RDMA transport support ],,
+ [ --enable-rdma : Infiniband RDMA transport support ],,
[ enable_rdma="no" ])
AM_CONDITIONAL(BUILD_RDMA, test x$enable_rdma = xyes)
+AC_ARG_ENABLE([statgrab],
+ [ --enable-statgrab : statgrab resource monitoring ],,
+ [ default="no" ])
+AM_CONDITIONAL(HAVE_STATGRAB, test x$enable_statgrab = xyes)
+
+
AC_ARG_ENABLE([augeas],
- [ --enable-augeas : Install the augeas lens for corosync.conf ],,
+ [ --enable-augeas : Install the augeas lens for corosync.conf ],,
[ enable_augeas="no" ])
AM_CONDITIONAL(INSTALL_AUGEAS, test x$enable_augeas = xyes)
@@ -349,6 +355,14 @@ if test "x${enable_rdma}" = xyes; then
PACKAGE_FEATURES="$PACKAGE_FEATURES rdma"
fi
+if test "x${enable_statgrab}" = xyes; then
+ AC_CHECK_LIB([statgrab], [sg_get_mem_stats])
+ AC_CHECK_HEADERS([statgrab.h])
+ statgrab_LIBS="-lstatgrab"
+ AC_SUBST([statgrab_LIBS])
+ PACKAGE_FEATURES="$PACKAGE_FEATURES statgrab"
+fi
+
if test "x${enable_augeas}" = xyes; then
PACKAGE_FEATURES="$PACKAGE_FEATURES augeas"
fi
diff --git a/exec/Makefile.am b/exec/Makefile.am
index f367f29..7c207c3 100644
--- a/exec/Makefile.am
+++ b/exec/Makefile.am
@@ -59,7 +59,7 @@ libcoroipcs_a_SOURCES = $(COROIPCS_SRC)
corosync_SOURCES = main.c util.c sync.c apidef.c service.c \
timer.c totemconfig.c mainconfig.c quorum.c schedwrk.c \
../lcr/lcr_ifact.c evil.c syncv2.c
-corosync_LDADD = -ltotem_pg -llogsys -lcoroipcs
+corosync_LDADD = -ltotem_pg -llogsys -lcoroipcs $(statgrab_LIBS)
corosync_DEPENDENCIES = libtotem_pg.so.$(SONAME) liblogsys.so.$(SONAME) libcoroipcs.so.$(SONAME)
corosync_LDFLAGS = $(OS_DYFLAGS) -L./
diff --git a/include/corosync/corodefs.h b/include/corosync/corodefs.h
index 57923e2..8a81327 100644
--- a/include/corosync/corodefs.h
+++ b/include/corosync/corodefs.h
@@ -59,7 +59,9 @@ enum corosync_service_types {
NTF_SERVICE = 16,
AMF_V2_SERVICE = 17,
TST_SV1_SERVICE = 18,
- TST_SV2_SERVICE = 19
+ TST_SV2_SERVICE = 19,
+ MR_SERVICE = 20,
+ WD_SERVICE = 21
};
#ifdef HAVE_SMALL_MEMORY_FOOTPRINT
diff --git a/services/Makefile.am b/services/Makefile.am
index bb63336..7c17409 100644
--- a/services/Makefile.am
+++ b/services/Makefile.am
@@ -37,7 +37,10 @@ INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include \
-I$(top_builddir)/include/corosync \
-I$(top_srcdir)/include/corosync
-SERVICE_LCRSO = evs cfg cpg confdb pload
+SERVICE_LCRSO = evs cfg cpg confdb pload wd
+if HAVE_STATGRAB
+SERVICE_LCRSO += mr
+endif
QUORUM_LCRSO = votequorum testquorum
diff --git a/services/mr.c b/services/mr.c
new file mode 100644
index 0000000..39cbddf
--- /dev/null
+++ b/services/mr.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc.
+ *
+ * All rights reserved.
+ *
+ * Author: Angus Salkeld <asalkeld at redhat.com>
+ *
+ * This software licensed under BSD license, the text of which follows:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * - Neither the name of the MontaVista Software, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <config.h>
+
+#include <unistd.h>
+#include <statgrab.h>
+
+#include <corosync/corotypes.h>
+#include <corosync/corodefs.h>
+#include <corosync/lcr/lcr_comp.h>
+#include <corosync/engine/coroapi.h>
+#include <corosync/list.h>
+#include <corosync/engine/logsys.h>
+
+
+LOGSYS_DECLARE_SUBSYS ("MR");
+
+/*
+ * Service Interfaces required by service_message_handler struct
+ */
+static int mr_exec_init_fn ( /* exec_init hook: builds the objdb entries and starts the polling thread */
+ struct corosync_api_v1 *corosync_api);
+
+static void mr_confchg_fn ( /* confchg hook: has an empty body in this service */
+ enum totem_configuration_type configuration_type,
+ const unsigned int *member_list, size_t member_list_entries,
+ const unsigned int *left_list, size_t left_list_entries,
+ const unsigned int *joined_list, size_t joined_list_entries,
+ const struct memb_ring_id *ring_id);
+
+
+static int mr_lib_init_fn (void *conn); /* lib_init hook: always returns 0 */
+
+static int mr_lib_exit_fn (void *conn); /* lib_exit hook: always returns 0 */
+
+static struct corosync_api_v1 *api; /* corosync API table, stored by mr_exec_init_fn */
+
+static hdb_handle_t memory_used_obj; /* objdb handle of resources/system/memory_used, set in mr_exec_init_fn */
+static hdb_handle_t load_15min_obj; /* objdb handle of resources/system/load_15min, set in mr_exec_init_fn */
+static pthread_t mr_poll_thread; /* thread running mr_thread_handler, created in mr_exec_init_fn */
+
+struct corosync_service_engine mr_service_engine = { /* service descriptor returned by mr_get_service_engine_ver0 */
+ .name = "corosync monitoring and recovery service",
+ .id = MR_SERVICE, /* service id added to corosync_service_types by this patch */
+ .priority = 1,
+ .private_data_size = 0, /* no per-connection private data */
+ .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED,
+ .lib_init_fn = mr_lib_init_fn,
+ .lib_exit_fn = mr_lib_exit_fn,
+ .lib_engine = NULL, /* no library message handlers */
+ .lib_engine_count = 0,
+ .exec_engine = NULL, /* no executive message handlers */
+ .exec_engine_count = 0,
+ .confchg_fn = mr_confchg_fn,
+ .exec_init_fn = mr_exec_init_fn, /* where the objdb entries and polling thread are set up */
+ .exec_dump_fn = NULL,
+ .sync_mode = CS_SYNC_V2
+};
+
+static DECLARE_LIST_INIT (confchg_notify); /* NOTE(review): not referenced anywhere else in mr.c - candidate for removal */
+
+/*
+ * Dynamic loading descriptor: the tables below are what
+ * corosync_lcr_component_register() passes to the LCR calls.
+ */
+
+static struct corosync_service_engine *mr_get_service_engine_ver0 (void);
+
+static struct corosync_service_engine_iface_ver0 mr_service_engine_iface = { /* version-0 interface: a single accessor for the engine */
+ .corosync_get_service_engine_ver0 = mr_get_service_engine_ver0
+};
+
+static struct lcr_iface corosync_mr_ver0[1] = {
+ {
+ .name = "corosync_mr", /* presumably the name matched against the "service { name: ... }" config entry */
+ .version = 0,
+ .versions_replace = 0,
+ .versions_replace_count = 0,
+ .dependencies = 0,
+ .dependency_count = 0,
+ .constructor = NULL,
+ .destructor = NULL,
+ .interfaces = NULL, /* supplied later through lcr_interfaces_set() */
+ }
+};
+
+static struct lcr_comp mr_comp_ver0 = { /* component wrapper holding the single interface above */
+ .iface_count = 1,
+ .ifaces = corosync_mr_ver0
+};
+
+/*
+ * Version-0 accessor exposed through mr_service_engine_iface:
+ * hands back the address of the monitoring service descriptor.
+ */
+static struct corosync_service_engine *mr_get_service_engine_ver0 (void)
+{
+	struct corosync_service_engine *engine = &mr_service_engine;
+
+	return engine;
+}
+
+#ifdef COROSYNC_SOLARIS /* Solaris build: plain externally visible function, no constructor attribute */
+void corosync_lcr_component_register (void);
+
+void corosync_lcr_component_register (void) {
+#else /* other builds: static function carrying the constructor attribute */
+__attribute__ ((constructor)) static void corosync_lcr_component_register (void) {
+#endif
+ lcr_interfaces_set (&corosync_mr_ver0[0], &mr_service_engine_iface); /* pair the "corosync_mr" interface entry with the engine accessor table */
+
+ lcr_component_register (&mr_comp_ver0); /* hand the component descriptor to LCR */
+}
+
+/*
+ * Publish a freshly sampled value for one monitored resource.
+ *
+ * resource        objdb handle of the resource object (e.g. memory_used)
+ * new_value       pointer to the sample: read as a double when type is
+ *                 OBJDB_VALUETYPE_DOUBLE, otherwise read as a uint32_t
+ * new_value_size  size of the sample in bytes
+ * type            objdb value type of the sample
+ *
+ * The sample replaces the "current" key.  It is then compared with the
+ * configured "max" key: above max means "failed", otherwise "good".
+ * Nothing is done if "max" is missing or is not a number.
+ */
+static void update_resource (hdb_handle_t resource,
+	void *new_value, size_t new_value_size, objdb_value_types_t type)
+{
+	char *max_str;
+	size_t max_str_len;
+	objdb_value_types_t max_type;
+	char max_buf[64];
+	char *end;
+	const char *status = "good";
+	double maxd;
+	long maxl;
+
+	if (api->object_key_replace (resource,
+		"current", strlen ("current"),
+		new_value, new_value_size) != 0) {
+		log_printf (LOGSYS_LEVEL_WARNING,
+			"Unable to update the \"current\" key\n");
+	}
+
+	if (api->object_key_get_typed (resource, "max",
+		(void**)&max_str, &max_str_len, &max_type) != 0) {
+		return;
+	}
+	if (max_str == NULL || max_str_len == 0) {
+		return;
+	}
+
+	/*
+	 * objdb values carry an explicit length, so do not rely on a
+	 * terminating NUL: parse a bounded, terminated copy instead.
+	 */
+	if (max_str_len >= sizeof (max_buf)) {
+		max_str_len = sizeof (max_buf) - 1;
+	}
+	memcpy (max_buf, max_str, max_str_len);
+	max_buf[max_str_len] = '\0';
+
+	switch (type) {
+	case OBJDB_VALUETYPE_DOUBLE:
+		maxd = strtod (max_buf, &end);
+		if (end == max_buf) {
+			log_printf (LOGSYS_LEVEL_WARNING,
+				"Ignoring non-numeric max \"%s\"\n", max_buf);
+			return;
+		}
+		if (*((double*)new_value) > maxd) {
+			status = "failed";
+		}
+		break;
+
+	default:
+		maxl = strtol (max_buf, &end, 0);
+		if (end == max_buf) {
+			log_printf (LOGSYS_LEVEL_WARNING,
+				"Ignoring non-numeric max \"%s\"\n", max_buf);
+			return;
+		}
+		/*
+		 * Compare in one unsigned domain so the result does not
+		 * depend on the width of long; a negative limit is exceeded
+		 * by every sample.
+		 */
+		if (maxl < 0 ||
+			(unsigned long)(*((uint32_t*)new_value)) > (unsigned long)maxl) {
+			status = "failed";
+		}
+		break;
+	}
+
+	/*
+	 * Publishing the verdict is deliberately left switched off in this
+	 * early patch to make testing easier.  To enable it:
+	 *
+	 *   api->object_key_replace (resource,
+	 *       "state", strlen ("state"),
+	 *       status, strlen (status));
+	 */
+	(void)status;
+}
+
+/*
+ * Body of the polling thread started by mr_exec_init_fn.
+ *
+ * Every 30 seconds it samples, via libstatgrab:
+ *  - the percentage of memory + swap in use, and
+ *  - the 15 minute load average,
+ * and passes each sample to update_resource().  The loop never ends, so the
+ * final return is only there to satisfy the signature.  The argument is
+ * not used.
+ *
+ * NOTE(review): this thread calls the objdb API from outside the corosync
+ * main loop - confirm that is safe without additional locking.
+ */
+static void *mr_thread_handler (void * unused)
+{
+	sg_mem_stats *mem_stats;
+	sg_swap_stats *swap_stats;
+	sg_load_stats *load_stats;
+	long long total;
+	long long freemem;
+	uint32_t new_value;
+
+	sg_init();
+
+	while (1) {
+		mem_stats = sg_get_mem_stats();
+		swap_stats = sg_get_swap_stats();
+
+		if (mem_stats != NULL && swap_stats != NULL) {
+			total = mem_stats->total + swap_stats->total;
+			freemem = mem_stats->free + swap_stats->free;
+			if (total > 0) {
+				new_value = ((total - freemem) * 100) / total;
+				update_resource (memory_used_obj,
+					&new_value, sizeof (new_value),
+					OBJDB_VALUETYPE_UINT32);
+			}
+			else {
+				/* a zero total would make the percentage a division by zero */
+				log_printf (LOGSYS_LEVEL_ERROR,
+					"Unable to get VM stats: total memory reported as 0\n");
+			}
+		}
+		else {
+			log_printf (LOGSYS_LEVEL_ERROR, "Unable to get VM stats: %s\n",
+				sg_str_error(sg_get_error()));
+		}
+
+		load_stats = sg_get_load_stats ();
+		if (load_stats != NULL) {
+			update_resource (load_15min_obj,
+				&load_stats->min15, sizeof (load_stats->min15),
+				OBJDB_VALUETYPE_DOUBLE);
+		}
+		else {
+			log_printf (LOGSYS_LEVEL_ERROR, "Unable to get load stats: %s\n",
+				sg_str_error(sg_get_error()));
+		}
+
+		sleep(30);
+	}
+	return NULL;
+}
+
+/*
+ * Look up the first child of parent_object_handle called object_name and
+ * create it when no such child exists.
+ *
+ * parent_object_handle  objdb object to search in
+ * object_handle         out: handle of the found or newly created object
+ * object_name           name bytes of the wanted child
+ * object_name_len       number of bytes in object_name
+ *
+ * Returns 0 when the object was found, the result of object_create when it
+ * had to be created, and -1 when the search itself could not be started.
+ */
+static int object_find_or_create (
+	hdb_handle_t parent_object_handle,
+	hdb_handle_t *object_handle,
+	const void *object_name,
+	size_t object_name_len)
+{
+	hdb_handle_t finder;
+	hdb_handle_t found;
+	int ret;
+
+	/* without a valid finder the calls below would use an unset handle */
+	if (api->object_find_create (
+		parent_object_handle,
+		object_name,
+		object_name_len,
+		&finder) != 0) {
+		return -1;
+	}
+
+	if (api->object_find_next (finder, &found) == 0) {
+		*object_handle = found;
+		ret = 0;
+	}
+	else {
+		ret = api->object_create (parent_object_handle,
+			object_handle,
+			object_name, object_name_len);
+	}
+
+	api->object_find_destroy (finder);
+	return ret;
+}
+
+
+/*
+ * Service start-up hook (.exec_init_fn of mr_service_engine).
+ *
+ * Stores the corosync API table, makes sure the objdb objects
+ * resources/system/memory_used and resources/system/load_15min exist,
+ * gives each of them a zeroed "current" key and starts the polling thread.
+ *
+ * Returns 0 on success and -1 when an object or the thread could not be
+ * created.
+ *
+ * Planned objdb layout (only memory_used and load_15min are handled so far):
+ *
+ * /resources/
+ *   system/
+ *     memory_used/
+ *       max = <X>%
+ *       current = ()
+ *     load_15min/
+ *       max = <X>%
+ *     used_inodes/
+ *       max = <X>%
+ *     used_blocks/
+ *       max = <X>%
+ *     net_in_errors/
+ *       max = <X>%
+ *     net_out_errors/
+ *       max = <X>%
+ *
+ *   processes/
+ *     <name|pid>
+ */
+static int mr_exec_init_fn (
+	struct corosync_api_v1 *corosync_api)
+{
+	hdb_handle_t resources_obj;
+	hdb_handle_t system_obj;
+	uint32_t zero_32 = 0;
+	double zero_double = 0;
+
+	/* the log subsystem has to be initialised before the first log_printf */
+#ifdef COROSYNC_SOLARIS
+	logsys_subsys_init();
+#endif
+	log_printf (LOGSYS_LEVEL_INFO, "%s\n", __func__);
+	api = corosync_api;
+
+	/* each level is the parent of the next, so stop at the first failure */
+	if (object_find_or_create (OBJECT_PARENT_HANDLE,
+			&resources_obj,
+			"resources", strlen ("resources")) != 0 ||
+		object_find_or_create (resources_obj,
+			&system_obj,
+			"system", strlen ("system")) != 0 ||
+		object_find_or_create (system_obj,
+			&memory_used_obj,
+			"memory_used", strlen ("memory_used")) != 0 ||
+		object_find_or_create (system_obj,
+			&load_15min_obj,
+			"load_15min", strlen ("load_15min")) != 0) {
+		log_printf (LOGSYS_LEVEL_ERROR,
+			"Unable to set up the resources/system objects\n");
+		return -1;
+	}
+
+	api->object_key_create_typed (memory_used_obj,
+		"current", &zero_32,
+		sizeof (zero_32), OBJDB_VALUETYPE_UINT32);
+
+	api->object_key_create_typed (load_15min_obj,
+		"current", &zero_double,
+		sizeof (zero_double), OBJDB_VALUETYPE_DOUBLE);
+
+	if (pthread_create (&mr_poll_thread, NULL, mr_thread_handler, NULL) != 0) {
+		log_printf (LOGSYS_LEVEL_ERROR,
+			"Unable to start the monitoring thread\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static void mr_confchg_fn ( /* membership-change hook installed as .confchg_fn of mr_service_engine */
+ enum totem_configuration_type configuration_type,
+ const unsigned int *member_list, size_t member_list_entries,
+ const unsigned int *left_list, size_t left_list_entries,
+ const unsigned int *joined_list, size_t joined_list_entries,
+ const struct memb_ring_id *ring_id)
+{ /* intentionally empty: none of the arguments are used */
+}
+
+/*
+ * Library connection hook (.lib_init_fn).  There is no per-connection
+ * state to prepare, so it reports success unconditionally.
+ */
+static int mr_lib_init_fn (void *conn)
+{
+	const int rc = 0;
+
+	return rc;
+}
+
+/*
+ * Library disconnection hook (.lib_exit_fn).  Nothing was set up for the
+ * connection, so there is nothing to release; always reports success.
+ */
+static int mr_lib_exit_fn (void *conn)
+{
+	const int rc = 0;
+
+	return rc;
+}
+
diff --git a/services/wd.c b/services/wd.c
new file mode 100644
index 0000000..3fb1db0
--- /dev/null
+++ b/services/wd.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc.
+ *
+ * All rights reserved.
+ *
+ * Author: Angus Salkeld <asalkeld at redhat.com>
+ *
+ * This software licensed under BSD license, the text of which follows:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * - Neither the name of the MontaVista Software, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <config.h>
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/watchdog.h>
+
+#include <corosync/corotypes.h>
+#include <corosync/corodefs.h>
+#include <corosync/lcr/lcr_comp.h>
+#include <corosync/engine/coroapi.h>
+#include <corosync/list.h>
+#include <corosync/engine/logsys.h>
+
+
+typedef enum { /* verdict produced by resource_state_get() for one resource object */
+ WD_RESOURCE_GOOD, /* recovery is "watchdog" and state is "good" */
+ WD_RESOURCE_FAILED, /* recovery is "watchdog" and state is "failed" */
+ WD_RESOURCE_UNKNOWN, /* recovery is "watchdog" but state is neither "good" nor "failed" */
+ WD_RESOURCE_NOT_MONITORED /* "recovery" or "state" key missing, or recovery is not "watchdog" */
+} wd_resource_state_t;
+
+
+LOGSYS_DECLARE_SUBSYS ("WD");
+
+/*
+ * Service Interfaces required by service_message_handler struct
+ */
+static int wd_exec_init_fn ( /* exec_init hook: opens the watchdog and arms the tickle timer */
+ struct corosync_api_v1 *corosync_api);
+static int wd_exec_exit_fn (void); /* exec_exit hook: writes the magic-close character to the watchdog */
+
+static void wd_confchg_fn ( /* confchg hook: has an empty body in this service */
+ enum totem_configuration_type configuration_type,
+ const unsigned int *member_list, size_t member_list_entries,
+ const unsigned int *left_list, size_t left_list_entries,
+ const unsigned int *joined_list, size_t joined_list_entries,
+ const struct memb_ring_id *ring_id);
+
+
+static int wd_lib_init_fn (void *conn); /* lib_init hook: always returns 0 */
+static int wd_lib_exit_fn (void *conn); /* lib_exit hook: always returns 0 */
+
+static struct corosync_api_v1 *api; /* corosync API table, stored by wd_exec_init_fn */
+static uint32_t wd_timeout = 5; /* seconds between tickles; setup_watchdog() replaces it with half the device timeout */
+static int dog; /* descriptor returned by open("/dev/watchdog") in setup_watchdog() */
+static corosync_timer_handle_t wd_timer; /* handle filled in by timer_add_duration() */
+
+struct corosync_service_engine wd_service_engine = { /* service descriptor returned by wd_get_service_engine_ver0 */
+ .name = "corosync watchdog fencing service",
+ .id = WD_SERVICE, /* service id added to corosync_service_types by this patch */
+ .priority = 1,
+ .private_data_size = 0, /* no per-connection private data */
+ .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED,
+ .lib_init_fn = wd_lib_init_fn,
+ .lib_exit_fn = wd_lib_exit_fn,
+ .lib_engine = NULL, /* no library message handlers */
+ .lib_engine_count = 0,
+ .exec_engine = NULL, /* no executive message handlers */
+ .exec_engine_count = 0,
+ .confchg_fn = wd_confchg_fn,
+ .exec_init_fn = wd_exec_init_fn, /* opens /dev/watchdog and starts the tickle timer */
+ .exec_exit_fn = wd_exec_exit_fn, /* performs the magic-close write */
+ .exec_dump_fn = NULL,
+ .sync_mode = CS_SYNC_V2
+};
+
+static DECLARE_LIST_INIT (confchg_notify); /* NOTE(review): not referenced anywhere else in wd.c - candidate for removal */
+
+/*
+ * Dynamic loading descriptor: the tables below are what
+ * corosync_lcr_component_register() passes to the LCR calls.
+ */
+
+static struct corosync_service_engine *wd_get_service_engine_ver0 (void);
+
+static struct corosync_service_engine_iface_ver0 wd_service_engine_iface = { /* version-0 interface: a single accessor for the engine */
+ .corosync_get_service_engine_ver0 = wd_get_service_engine_ver0
+};
+
+static struct lcr_iface corosync_wd_ver0[1] = {
+ {
+ .name = "corosync_wd", /* presumably the name matched against the "service { name: ... }" config entry */
+ .version = 0,
+ .versions_replace = 0,
+ .versions_replace_count = 0,
+ .dependencies = 0,
+ .dependency_count = 0,
+ .constructor = NULL,
+ .destructor = NULL,
+ .interfaces = NULL, /* supplied later through lcr_interfaces_set() */
+ }
+};
+
+static struct lcr_comp wd_comp_ver0 = { /* component wrapper holding the single interface above */
+ .iface_count = 1,
+ .ifaces = corosync_wd_ver0
+};
+
+/*
+ * Version-0 accessor exposed through wd_service_engine_iface:
+ * hands back the address of the watchdog service descriptor.
+ */
+static struct corosync_service_engine *wd_get_service_engine_ver0 (void)
+{
+	struct corosync_service_engine *engine = &wd_service_engine;
+
+	return engine;
+}
+
+#ifdef COROSYNC_SOLARIS /* Solaris build: plain externally visible function, no constructor attribute */
+void corosync_lcr_component_register (void);
+
+void corosync_lcr_component_register (void) {
+#else /* other builds: static function carrying the constructor attribute */
+__attribute__ ((constructor)) static void corosync_lcr_component_register (void) {
+#endif
+ lcr_interfaces_set (&corosync_wd_ver0[0], &wd_service_engine_iface); /* pair the "corosync_wd" interface entry with the engine accessor table */
+
+ lcr_component_register (&wd_comp_ver0); /* hand the component descriptor to LCR */
+}
+
+/*
+ * Compare a length-delimited objdb value with a C string.
+ * The value matches whether or not value_len counts a trailing NUL.
+ * Returns non-zero on a match.
+ */
+static int wd_value_is (const char *value, size_t value_len, const char *expected)
+{
+	size_t expected_len = strlen (expected);
+
+	if (value == NULL) {
+		return 0;
+	}
+	if (value_len == expected_len + 1 && value[expected_len] == '\0') {
+		value_len = expected_len;
+	}
+	return (value_len == expected_len &&
+		memcmp (value, expected, expected_len) == 0);
+}
+
+/*
+ * Classify one resource object for the watchdog.
+ *
+ * Reads the "recovery" and "state" keys of resource:
+ *  - either key missing, or recovery != "watchdog": WD_RESOURCE_NOT_MONITORED
+ *  - state == "failed":                             WD_RESOURCE_FAILED
+ *  - state == "good":                               WD_RESOURCE_GOOD
+ *  - any other state:                               WD_RESOURCE_UNKNOWN
+ *
+ * The values are compared using the lengths reported by objdb rather than
+ * strcmp(), because a writer may store a value without a terminating NUL.
+ */
+static wd_resource_state_t resource_state_get (hdb_handle_t resource)
+{
+	char *recov;
+	size_t recov_len;
+	char *state;
+	size_t state_len;
+	objdb_value_types_t type;
+
+	if (api->object_key_get_typed (resource,
+		"recovery", (void**)&recov, &recov_len, &type) != 0) {
+		/* key does not exist */
+		return WD_RESOURCE_NOT_MONITORED;
+	}
+	if (api->object_key_get_typed (resource,
+		"state", (void**)&state, &state_len, &type) != 0) {
+		/* key does not exist */
+		return WD_RESOURCE_NOT_MONITORED;
+	}
+
+	if (!wd_value_is (recov, recov_len, "watchdog")) {
+		return WD_RESOURCE_NOT_MONITORED;
+	}
+	if (wd_value_is (state, state_len, "failed")) {
+		return WD_RESOURCE_FAILED;
+	}
+	if (wd_value_is (state, state_len, "good")) {
+		return WD_RESOURCE_GOOD;
+	}
+	return WD_RESOURCE_UNKNOWN;
+}
+
+
+/*
+ * Timer callback: decide whether the watchdog may be tickled.
+ *
+ * Walks /resources/<type>/<resource>, asks resource_state_get() about every
+ * resource and logs the verdict.  The watchdog is only kept alive when no
+ * resource reported WD_RESOURCE_FAILED.  The timer is then re-armed for
+ * another wd_timeout seconds.
+ *
+ * If the "resources" object cannot be found the function returns at once,
+ * without tickling and without re-arming the timer.
+ *
+ * arg is not used.
+ */
+static void wd_tickle_fn (void* arg)
+{
+	hdb_handle_t type_finder;
+	hdb_handle_t resource_finder;
+	hdb_handle_t resources;
+	hdb_handle_t resource_type;
+	hdb_handle_t resource;
+	char type_name[128];
+	size_t type_name_len;
+	char resource_name[128];
+	size_t resource_name_len;
+	int all_ok = 1;
+	int res;
+	wd_resource_state_t state;
+
+	if (api->object_find_create (
+		OBJECT_PARENT_HANDLE,
+		"resources", strlen ("resources"),
+		&type_finder) != 0) {
+		return;
+	}
+	res = api->object_find_next (type_finder, &resources);
+	api->object_find_destroy (type_finder);
+	if (res != 0) {
+		return;
+	}
+
+	/* this will be the system or processes level
+	 */
+	if (api->object_find_create (
+		resources,
+		NULL, 0,
+		&type_finder) == 0) {
+
+		while (api->object_find_next (type_finder,
+			&resource_type) == 0) {
+
+			/*
+			 * Names come back as bytes plus a length, so they are
+			 * printed with an explicit precision instead of %s.
+			 */
+			if (api->object_name_get (resource_type,
+				type_name, &type_name_len) != 0) {
+				type_name_len = 0;
+			}
+			if (type_name_len > sizeof (type_name)) {
+				type_name_len = sizeof (type_name);
+			}
+
+			if (api->object_find_create (
+				resource_type,
+				NULL, 0,
+				&resource_finder) != 0) {
+				continue;
+			}
+
+			while (api->object_find_next (resource_finder,
+				&resource) == 0) {
+
+				if (api->object_name_get (resource,
+					resource_name, &resource_name_len) != 0) {
+					resource_name_len = 0;
+				}
+				if (resource_name_len > sizeof (resource_name)) {
+					resource_name_len = sizeof (resource_name);
+				}
+
+				state = resource_state_get (resource);
+				switch (state) {
+				case WD_RESOURCE_FAILED:
+					all_ok = 0;
+					log_printf (LOGSYS_LEVEL_CRIT,
+						"/resources/%.*s/%.*s failed!",
+						(int)type_name_len, type_name,
+						(int)resource_name_len, resource_name);
+					break;
+				case WD_RESOURCE_GOOD:
+					log_printf (LOGSYS_LEVEL_INFO,
+						"/resources/%.*s/%.*s good.",
+						(int)type_name_len, type_name,
+						(int)resource_name_len, resource_name);
+					break;
+				case WD_RESOURCE_UNKNOWN:
+					/* watched by the dog, but no verdict yet */
+					log_printf (LOGSYS_LEVEL_INFO,
+						"/resources/%.*s/%.*s state unknown.",
+						(int)type_name_len, type_name,
+						(int)resource_name_len, resource_name);
+					break;
+				default:
+					log_printf (LOGSYS_LEVEL_INFO,
+						"/resources/%.*s/%.*s not monitored.",
+						(int)type_name_len, type_name,
+						(int)resource_name_len, resource_name);
+					break;
+				}
+			}
+			api->object_find_destroy (resource_finder);
+		}
+		api->object_find_destroy (type_finder);
+	}
+
+	if (all_ok) {
+		/* tickle */
+		if (ioctl(dog, WDIOC_KEEPALIVE, &all_ok) != 0) {
+			log_printf (LOGSYS_LEVEL_CRIT,
+				"Unable to tickle the watchdog!");
+		}
+		else {
+			log_printf (LOGSYS_LEVEL_INFO,
+				"all watchdog'ed resources are good.");
+		}
+	}
+	else {
+		log_printf (LOGSYS_LEVEL_ALERT,
+			"all watchdog'ed resources are NOT good, NOT tickling the watchdog!");
+	}
+
+	if (api->timer_add_duration((unsigned long long)wd_timeout*1000000000, NULL,
+		wd_tickle_fn, &wd_timer) != 0) {
+		log_printf (LOGSYS_LEVEL_CRIT,
+			"Unable to re-arm the watchdog tickle timer!");
+	}
+}
+
+/*
+ * Count the resources under /resources/<type>/ that the watchdog has to
+ * look after, i.e. those for which resource_state_get() returns anything
+ * other than WD_RESOURCE_NOT_MONITORED.
+ *
+ * Returns 0 when there is no "resources" object or the tree cannot be
+ * walked.
+ */
+static int num_resources_need_watchdog(void)
+{
+	hdb_handle_t type_finder;
+	hdb_handle_t resource_finder;
+	hdb_handle_t resources;
+	hdb_handle_t resource_type;
+	hdb_handle_t resource;
+	int res;
+	int number = 0;
+
+	if (api->object_find_create (
+		OBJECT_PARENT_HANDLE,
+		"resources", strlen ("resources"),
+		&type_finder) != 0) {
+		return number;
+	}
+	res = api->object_find_next (type_finder, &resources);
+	api->object_find_destroy (type_finder);
+	if (res != 0) {
+		return number;
+	}
+
+	/* this will be the system or processes level
+	 */
+	if (api->object_find_create (
+		resources,
+		NULL, 0,
+		&type_finder) != 0) {
+		return number;
+	}
+
+	while (api->object_find_next (type_finder,
+		&resource_type) == 0) {
+
+		if (api->object_find_create (
+			resource_type,
+			NULL, 0,
+			&resource_finder) != 0) {
+			continue;
+		}
+
+		while (api->object_find_next (resource_finder,
+			&resource) == 0) {
+			if (resource_state_get (resource) != WD_RESOURCE_NOT_MONITORED) {
+				number++;
+			}
+		}
+		api->object_find_destroy (resource_finder);
+	}
+	api->object_find_destroy (type_finder);
+
+	return number;
+}
+
+/*
+ * Open /dev/watchdog and work out how often it has to be tickled.
+ *
+ * On success the descriptor is left in dog, wd_timeout holds half of the
+ * timeout reported by the device (never less than one second; the built-in
+ * default is kept when the device does not report one) and the card has
+ * been asked to start.
+ *
+ * Returns 0 on success, -1 when the device is missing or cannot be opened.
+ */
+static int setup_watchdog(void)
+{
+	struct watchdog_info ident = { 0 };
+	int device_timeout = 0;
+	int options = WDIOS_ENABLECARD;
+
+	if (access ("/dev/watchdog", W_OK) != 0) {
+		log_printf (LOGSYS_LEVEL_WARNING, "No Watchdog, try modprobe <a watchdog>");
+		return -1;
+	}
+
+	/* here goes, lets hope they have "Magic Close"
+	 */
+	dog = open("/dev/watchdog", O_WRONLY);
+
+	if (dog == -1) {
+		log_printf (LOGSYS_LEVEL_WARNING, "Watchdog exists but couldn't be opened.");
+		return -1;
+	}
+
+	/* Right we have the dog.
+	 * Lets see what breed it is.
+	 */
+	if (ioctl(dog, WDIOC_GETSUPPORT, &ident) != 0) {
+		/* ident stays zeroed, so the identity logged below is empty */
+		log_printf (LOGSYS_LEVEL_WARNING, "Unable to query the watchdog's capabilities.");
+	}
+	/* identity is a fixed-size array; make sure it can be printed with %s */
+	ident.identity[sizeof (ident.identity) - 1] = '\0';
+	log_printf (LOGSYS_LEVEL_INFO, "Watchdog is now been tickled by corosync.");
+	log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity);
+
+	/*
+	 * TODO: when (ident.options & WDIOF_SETTIMEOUT) is set the dog is
+	 * trained - set the timeout up from a config option.
+	 */
+
+	/* the ioctl fills in an int, so do not hand it the uint32_t directly */
+	if (ioctl(dog, WDIOC_GETTIMEOUT, &device_timeout) == 0 && device_timeout > 0) {
+		wd_timeout = (uint32_t)device_timeout;
+	}
+	log_printf (LOGSYS_LEVEL_DEBUG, "The timeout is %d seconds\n", (int)wd_timeout);
+
+	/* tickle twice per timeout; never let the interval collapse to zero */
+	wd_timeout = wd_timeout / 2;
+	if (wd_timeout == 0) {
+		wd_timeout = 1;
+	}
+
+	/* WDIOC_SETOPTIONS takes a pointer to the option flags, not the flags */
+	if (ioctl(dog, WDIOC_SETOPTIONS, &options) != 0) {
+		log_printf (LOGSYS_LEVEL_DEBUG, "Watchdog did not accept WDIOS_ENABLECARD\n");
+	}
+	return 0;
+}
+
+/*
+ * Service start-up hook (.exec_init_fn of wd_service_engine).
+ *
+ * Stores the corosync API table.  The watchdog is only opened when at
+ * least one resource asks for watchdog recovery; the tickle timer is then
+ * armed for wd_timeout seconds.
+ *
+ * Returns 0 when the timer is running, -1 when there is nothing to watch,
+ * the watchdog could not be set up, or the timer could not be armed.
+ */
+static int wd_exec_init_fn (
+	struct corosync_api_v1 *corosync_api)
+{
+
+#ifdef COROSYNC_SOLARIS
+	logsys_subsys_init();
+#endif
+	log_printf (LOGSYS_LEVEL_INFO, "%s\n", __func__);
+	api = corosync_api;
+
+	if (num_resources_need_watchdog() == 0) {
+		return -1;
+	}
+
+	/* this will setup wd_timeout */
+	if (setup_watchdog() != 0) {
+		return -1;
+	}
+
+	if (api->timer_add_duration((unsigned long long)wd_timeout*1000000000, NULL,
+		wd_tickle_fn, &wd_timer) != 0) {
+		/* the dog is already open here, so nobody will tickle it */
+		log_printf (LOGSYS_LEVEL_CRIT,
+			"Unable to start the watchdog tickle timer\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Service shutdown hook (.exec_exit_fn of wd_service_engine).
+ *
+ * Performs the "magic close": writes the character 'V' to the watchdog and
+ * then closes the descriptor, which asks a driver supporting magic close to
+ * stop the countdown.  dog is reset afterwards so the stale descriptor
+ * number is not used again.
+ *
+ * Always returns 0.
+ */
+static int wd_exec_exit_fn (void)
+{
+	char magic = 'V';
+
+	if (dog > 0) {
+		log_printf (LOGSYS_LEVEL_INFO, "%s: magic close.\n", __func__);
+		if (write (dog, &magic, 1) != 1) {
+			log_printf (LOGSYS_LEVEL_WARNING,
+				"%s: magic close write failed, the watchdog may still fire.\n",
+				__func__);
+		}
+		close (dog);
+		dog = -1;
+	}
+	return 0;
+}
+
+static void wd_confchg_fn ( /* membership-change hook installed as .confchg_fn of wd_service_engine */
+ enum totem_configuration_type configuration_type,
+ const unsigned int *member_list, size_t member_list_entries,
+ const unsigned int *left_list, size_t left_list_entries,
+ const unsigned int *joined_list, size_t joined_list_entries,
+ const struct memb_ring_id *ring_id)
+{ /* intentionally empty: none of the arguments are used */
+}
+
+/*
+ * Library connection hook (.lib_init_fn).  There is no per-connection
+ * state to prepare, so it reports success unconditionally.
+ */
+static int wd_lib_init_fn (void *conn)
+{
+	const int rc = 0;
+
+	return rc;
+}
+
+/*
+ * Library disconnection hook (.lib_exit_fn).  Nothing was set up for the
+ * connection, so there is nothing to release; always reports success.
+ */
+static int wd_lib_exit_fn (void *conn)
+{
+	const int rc = 0;
+
+	return rc;
+}
+
--
1.6.6.1
More information about the Openais
mailing list