STABLE2 - groupd: detect dead daemons and remove node from cluster

Ryan O'Hara rohara@fedoraproject.org
Mon Sep 29 13:31:00 GMT 2008


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=07aaff32a7abf3533bf560e5d2641b893c4bbd80
Commit:        07aaff32a7abf3533bf560e5d2641b893c4bbd80
Parent:        2468574b5056b45b26787db1e89664abefab043c
Author:        Ryan O'Hara <rohara@redhat.com>
AuthorDate:    Tue Sep 9 09:57:17 2008 -0500
Committer:     Ryan O'Hara <rohara@redhat.com>
CommitterDate: Mon Sep 29 08:29:25 2008 -0500

groupd: detect dead daemons and remove node from cluster

If any of the daemons that run within groupd fail unexpectedly, we
detect this failure and remove the node from the cluster. These
daemons include fenced, dlm_controld, and gfs_controld. If any of
these daemons die unexpectedly (or are killed), the cluster is
in an invalid state, so the proper thing to do is remove the node
from the cluster (cman_leave_cluster).

This behavior is enabled by default, but can be turned off with
the -s option for groupd. For example, 'groupd -s0' will disable
this "shutdown mode". (BZ #318571)
---
 group/daemon/cman.c        |    4 ++++
 group/daemon/gd_internal.h |    2 ++
 group/daemon/main.c        |   44 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 49 insertions(+), 1 deletions(-)

diff --git a/group/daemon/cman.c b/group/daemon/cman.c
index 1b44269..998197b 100644
--- a/group/daemon/cman.c
+++ b/group/daemon/cman.c
@@ -13,6 +13,10 @@ static cman_node_t	cman_nodes[MAX_NODES];
 static int		cman_node_count;
 static char		name_buf[CMAN_MAX_NODENAME_LEN+1];
 
+int shutdown_cman(void)	/* force this node to leave the cluster */
+{	/* returns whatever cman_leave_cluster() reports for the leave request */
+	return cman_leave_cluster(ch_admin, CMAN_LEAVEFLAG_FORCE);
+}
 
 int kill_cman(int nodeid)
 {
diff --git a/group/daemon/gd_internal.h b/group/daemon/gd_internal.h
index b56dd92..bb6e284 100644
--- a/group/daemon/gd_internal.h
+++ b/group/daemon/gd_internal.h
@@ -39,6 +39,7 @@
 extern char *prog_name;
 extern int groupd_debug_opt;
 extern int groupd_debug_verbose;
+extern int groupd_shutdown_opt;
 extern char groupd_debug_buf[256];
 extern char dump_buf[DUMP_SIZE];
 extern int dump_point;
@@ -266,6 +267,7 @@ void client_dead(int ci);
 /* cman.c */
 int setup_cman(void);
 int kill_cman(int nodeid);
+int shutdown_cman(void);
 int set_cman_dirty(void);
 
 /* cpg.c */
diff --git a/group/daemon/main.c b/group/daemon/main.c
index 8e278c5..18b1e50 100644
--- a/group/daemon/main.c
+++ b/group/daemon/main.c
@@ -3,7 +3,7 @@
 
 #include "gd_internal.h"
 
-#define OPTION_STRING			"DhVv"
+#define OPTION_STRING			"Dhs:Vv"
 #define LOCKFILE_NAME			"/var/run/groupd.pid"
 #define LOG_FILE				"/var/log/groupd.log"
 
@@ -147,6 +147,37 @@ static int kernel_instance_count(char *sysfs_dir)
 	return rv;
 }
 
+/*
+ * Decide whether the client in slot ci is one of the daemons whose loss
+ * should take this node out of the cluster.
+ *
+ * A client counts when its type begins with "fence", "dlm" or "gfs" and
+ * at least one entry on gd_groups has the same level as the client.
+ *
+ * Returns 1 in that case and 0 otherwise.
+ */
+int check_dead_daemons(int ci)
+{
+	const char *type = client[ci].type;
+	group_t *grp;
+	int watched;
+
+	watched = strncmp(type, "fence", 5) == 0 ||
+		  strncmp(type, "dlm", 3) == 0 ||
+		  strncmp(type, "gfs", 3) == 0;
+
+	/* Any other kind of client never triggers a shutdown. */
+	if (!watched)
+		return 0;
+
+	list_for_each_entry(grp, &gd_groups, list) {
+		if (grp->level == client[ci].level)
+			return 1;
+	}
+
+	return 0;
+}
+
 int check_uncontrolled_groups(void)
 {
 	pid_t pid;
@@ -421,6 +452,11 @@ static void client_alloc(void)
 
 void client_dead(int ci)
 {
+	if (groupd_shutdown_opt && check_dead_daemons(ci)) {
+		log_print("%s daemon appears to be dead", client[ci].type);
+		shutdown_cman();
+	}
+
 	close(client[ci].fd);
 	client[ci].workfn = NULL;
 	client[ci].fd = -1;
@@ -870,6 +906,7 @@ static void print_usage(void)
 	printf("\n");
 	printf("  -D	       Enable debugging code and don't fork\n");
 	printf("  -h	       Print this help, then exit\n");
+	printf("  -s [0|1]     Enable (or disable) shutdown mode\n");
 	printf("  -V	       Print program version information, then exit\n");
 }
 
@@ -892,6 +929,10 @@ static void decode_arguments(int argc, char **argv)
 			exit(EXIT_SUCCESS);
 			break;
 
+		case 's':
+			groupd_shutdown_opt = atoi(optarg);
+			break;
+
 		case 'v':
 			groupd_debug_verbose++;
 			break;
@@ -1021,6 +1062,7 @@ void groupd_dump_save(void)
 char *prog_name;
 int groupd_debug_opt;
 int groupd_debug_verbose;
+int groupd_shutdown_opt = 1;
 char groupd_debug_buf[256];
 char dump_buf[DUMP_SIZE];
 int dump_point;



More information about the Cluster-cvs mailing list