master - gfs_controld: kill the cluster on misbehaving nodes

David Teigland teigland@fedoraproject.org
Tue Aug 19 21:20:00 GMT 2008


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=34679f895b267f6efdfa83a63aa65d09e631d5ed
Commit:        34679f895b267f6efdfa83a63aa65d09e631d5ed
Parent:        3c6d20abb8feac1d3756a13bf98e062a385b5f10
Author:        David Teigland <teigland@redhat.com>
AuthorDate:    Tue Aug 19 15:40:50 2008 -0500
Committer:     David Teigland <teigland@redhat.com>
CommitterDate: Tue Aug 19 15:48:49 2008 -0500

gfs_controld: kill the cluster on misbehaving nodes

Kill cman on other nodes where the gfs_controld process fails.
Shut down cman locally if we find uncontrolled filesystems at startup.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 group/gfs_controld/cpg-new.c     |    3 ++
 group/gfs_controld/gfs_daemon.h  |    2 +
 group/gfs_controld/main.c        |    7 ++--
 group/gfs_controld/member_cman.c |   23 +++++++++++-
 group/gfs_controld/util.c        |   73 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 103 insertions(+), 5 deletions(-)

diff --git a/group/gfs_controld/cpg-new.c b/group/gfs_controld/cpg-new.c
index 28445c1..579de1e 100644
--- a/group/gfs_controld/cpg-new.c
+++ b/group/gfs_controld/cpg-new.c
@@ -2135,6 +2135,9 @@ static int add_change(struct mountgroup *mg,
 
 		log_group(mg, "add_change %u nodeid %d remove reason %d",
 			  cg->seq, memb->nodeid, left_list[i].reason);
+
+		if (left_list[i].reason == CPG_REASON_PROCDOWN)
+			kick_node_from_cluster(memb->nodeid);
 	}
 
 	for (i = 0; i < joined_list_entries; i++) {
diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
index 8d5c2bd..3cda467 100644
--- a/group/gfs_controld/gfs_daemon.h
+++ b/group/gfs_controld/gfs_daemon.h
@@ -287,6 +287,7 @@ void cluster_dead(int ci);
 int setup_cman(void);
 void close_cman(void);
 void process_cman(int ci);
+void kick_node_from_cluster(int nodeid);
 
 /* plock.c */
 int setup_plocks(void);
@@ -310,6 +311,7 @@ int read_sysfs_int(struct mountgroup *mg, char *field, int *val_out);
 int run_dmsetup_suspend(struct mountgroup *mg, char *dev);
 void update_dmsetup_wait(void);
 void update_flow_control_status(void);
+int check_uncontrolled_filesystems(void);
 
 /* logging.c */
 
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index 01de429..a9f20eb 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -1019,9 +1019,6 @@ static void loop(void)
 	void (*workfn) (int ci);
 	void (*deadfn) (int ci);
 
-	/* FIXME: add code that looks for uncontrolled instances of
-	   gfs filesystems in the kernel */
-
 	rv = setup_queries();
 	if (rv < 0)
 		goto out;
@@ -1042,6 +1039,10 @@ static void loop(void)
 
 	setup_logging();
 
+	rv = check_uncontrolled_filesystems();
+	if (rv < 0)
+		goto out;
+
 	rv = setup_uevent();
 	if (rv < 0)
 		goto out;
diff --git a/group/gfs_controld/member_cman.c b/group/gfs_controld/member_cman.c
index f8570c1..0375c7c 100644
--- a/group/gfs_controld/member_cman.c
+++ b/group/gfs_controld/member_cman.c
@@ -3,8 +3,21 @@
 #include <libcman.h>
 
 static cman_handle_t ch;
+static cman_handle_t ch_admin;
 static cman_cluster_t cluster;
 
+void kick_node_from_cluster(int nodeid)
+{
+	if (!nodeid) {	/* nodeid 0: shut cman down on this node */
+		log_error("telling cman to shut down cluster locally");
+		cman_shutdown(ch_admin, CMAN_SHUTDOWN_ANYWAY);
+	} else {	/* nonzero nodeid: ask cman to kill that node */
+		log_error("telling cman to remove nodeid %d from cluster",
+			  nodeid);
+		cman_kill_node(ch_admin, nodeid);
+	}
+}
+
 static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
 {
 	if (reason == CMAN_REASON_TRY_SHUTDOWN) {
@@ -33,12 +46,18 @@ int setup_cman(void)
 	int init = 0, active = 0;
 
  retry_init:
-	ch = cman_init(NULL);
-	if (!ch) {
+	ch_admin = cman_admin_init(NULL);
+	if (!ch_admin) {
 		if (init++ < 2) {
 			sleep(1);
 			goto retry_init;
 		}
+		log_error("cman_admin_init error %d", errno);
+		return -ENOTCONN;
+	}
+
+	ch = cman_init(NULL);
+	if (!ch) {
 		log_error("cman_init error %d", errno);
 		return -ENOTCONN;
 	}
diff --git a/group/gfs_controld/util.c b/group/gfs_controld/util.c
index a0ac39c..da2f410 100644
--- a/group/gfs_controld/util.c
+++ b/group/gfs_controld/util.c
@@ -211,3 +211,76 @@ void update_dmsetup_wait(void)
 	}
 }
 
+static int ignore_nolock(char *sysfs_dir, char *table)
+{
+	char path[PATH_MAX];
+	int fd;
+
+	memset(path, 0, PATH_MAX);
+
+	snprintf(path, PATH_MAX, "%s/%s/lock_module/proto_name",
+		 sysfs_dir, table);
+
+	/* lock_nolock doesn't create the "lock_module" dir at all, so the
+	   open below fails for it; any open failure is treated the same */
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return 1;	/* not openable: callers skip this entry */
+
+	close(fd);
+	return 0;	/* proto_name exists: entry is not ignored */
+}
+
+/* This is for the case where gfs_controld exits/fails, abandoning gfs
+   filesystems in the kernel, and then gfs_controld is restarted.  When
+   gfs_controld exits and abandons lockspaces, that node needs to be
+   rebooted to clear the uncontrolled filesystems from the kernel. */
+
+int check_uncontrolled_filesystems(void)
+{
+	DIR *d;
+	struct dirent *de;
+	int count = 0;	/* entries found that ignore_nolock() did not skip */
+
+	d = opendir("/sys/fs/gfs/");
+	if (!d)
+		goto gfs2;	/* cannot open gfs dir; still scan gfs2 */
+
+	while ((de = readdir(d))) {
+		if (de->d_name[0] == '.')	/* skip ".", ".." and dot names */
+			continue;
+
+		if (ignore_nolock("/sys/fs/gfs/", de->d_name))
+			continue;
+
+		log_error("found uncontrolled gfs fs %s", de->d_name);
+		count++;
+	}
+	closedir(d);
+
+ gfs2:
+	d = opendir("/sys/fs/gfs2/");
+	if (!d)
+		goto out;	/* cannot open gfs2 dir; go evaluate count */
+
+	while ((de = readdir(d))) {
+		if (de->d_name[0] == '.')	/* skip ".", ".." and dot names */
+			continue;
+
+		if (ignore_nolock("/sys/fs/gfs2/", de->d_name))
+			continue;
+
+		log_error("found uncontrolled gfs2 fs %s", de->d_name);
+		count++;
+	}
+	closedir(d);
+
+ out:
+	if (count) {
+		kick_node_from_cluster(our_nodeid);	/* NOTE(review): nonzero id takes the cman_kill_node path - confirm */
+		return -1;	/* at least one uncontrolled fs was found */
+	}
+	return 0;	/* nothing uncontrolled found */
+}
+



More information about the Cluster-cvs mailing list