master - gfs_controld: kill the cluster on misbehaving nodes
David Teigland
teigland@fedoraproject.org
Tue Aug 19 21:20:00 GMT 2008
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=34679f895b267f6efdfa83a63aa65d09e631d5ed
Commit: 34679f895b267f6efdfa83a63aa65d09e631d5ed
Parent: 3c6d20abb8feac1d3756a13bf98e062a385b5f10
Author: David Teigland <teigland@redhat.com>
AuthorDate: Tue Aug 19 15:40:50 2008 -0500
Committer: David Teigland <teigland@redhat.com>
CommitterDate: Tue Aug 19 15:48:49 2008 -0500
gfs_controld: kill the cluster on misbehaving nodes
Kill cman on other nodes where the gfs_controld process fails.
Shut down cman locally if we find uncontrolled filesystems at startup.
Signed-off-by: David Teigland <teigland@redhat.com>
---
group/gfs_controld/cpg-new.c | 3 ++
group/gfs_controld/gfs_daemon.h | 2 +
group/gfs_controld/main.c | 7 ++--
group/gfs_controld/member_cman.c | 23 +++++++++++-
group/gfs_controld/util.c | 73 ++++++++++++++++++++++++++++++++++++++
5 files changed, 103 insertions(+), 5 deletions(-)
diff --git a/group/gfs_controld/cpg-new.c b/group/gfs_controld/cpg-new.c
index 28445c1..579de1e 100644
--- a/group/gfs_controld/cpg-new.c
+++ b/group/gfs_controld/cpg-new.c
@@ -2135,6 +2135,9 @@ static int add_change(struct mountgroup *mg,
log_group(mg, "add_change %u nodeid %d remove reason %d",
cg->seq, memb->nodeid, left_list[i].reason);
+
+ if (left_list[i].reason == CPG_REASON_PROCDOWN)
+ kick_node_from_cluster(memb->nodeid);
}
for (i = 0; i < joined_list_entries; i++) {
diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
index 8d5c2bd..3cda467 100644
--- a/group/gfs_controld/gfs_daemon.h
+++ b/group/gfs_controld/gfs_daemon.h
@@ -287,6 +287,7 @@ void cluster_dead(int ci);
int setup_cman(void);
void close_cman(void);
void process_cman(int ci);
+void kick_node_from_cluster(int nodeid);
/* plock.c */
int setup_plocks(void);
@@ -310,6 +311,7 @@ int read_sysfs_int(struct mountgroup *mg, char *field, int *val_out);
int run_dmsetup_suspend(struct mountgroup *mg, char *dev);
void update_dmsetup_wait(void);
void update_flow_control_status(void);
+int check_uncontrolled_filesystems(void);
/* logging.c */
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index 01de429..a9f20eb 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -1019,9 +1019,6 @@ static void loop(void)
void (*workfn) (int ci);
void (*deadfn) (int ci);
- /* FIXME: add code that looks for uncontrolled instances of
- gfs filesystems in the kernel */
-
rv = setup_queries();
if (rv < 0)
goto out;
@@ -1042,6 +1039,10 @@ static void loop(void)
setup_logging();
+ rv = check_uncontrolled_filesystems();
+ if (rv < 0)
+ goto out;
+
rv = setup_uevent();
if (rv < 0)
goto out;
diff --git a/group/gfs_controld/member_cman.c b/group/gfs_controld/member_cman.c
index f8570c1..0375c7c 100644
--- a/group/gfs_controld/member_cman.c
+++ b/group/gfs_controld/member_cman.c
@@ -3,8 +3,21 @@
#include <libcman.h>
static cman_handle_t ch;
+static cman_handle_t ch_admin;
static cman_cluster_t cluster;
+void kick_node_from_cluster(int nodeid)
+{
+ if (!nodeid) {
+ log_error("telling cman to shut down cluster locally");
+ cman_shutdown(ch_admin, CMAN_SHUTDOWN_ANYWAY);
+ } else {
+ log_error("telling cman to remove nodeid %d from cluster",
+ nodeid);
+ cman_kill_node(ch_admin, nodeid);
+ }
+}
+
static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
{
if (reason == CMAN_REASON_TRY_SHUTDOWN) {
@@ -33,12 +46,18 @@ int setup_cman(void)
int init = 0, active = 0;
retry_init:
- ch = cman_init(NULL);
- if (!ch) {
+ ch_admin = cman_admin_init(NULL);
+ if (!ch_admin) {
if (init++ < 2) {
sleep(1);
goto retry_init;
}
+ log_error("cman_admin_init error %d", errno);
+ return -ENOTCONN;
+ }
+
+ ch = cman_init(NULL);
+ if (!ch) {
log_error("cman_init error %d", errno);
return -ENOTCONN;
}
diff --git a/group/gfs_controld/util.c b/group/gfs_controld/util.c
index a0ac39c..da2f410 100644
--- a/group/gfs_controld/util.c
+++ b/group/gfs_controld/util.c
@@ -211,3 +211,76 @@ void update_dmsetup_wait(void)
}
}
+static int ignore_nolock(char *sysfs_dir, char *table)
+{
+ char path[PATH_MAX];
+ int fd;
+
+ memset(path, 0, PATH_MAX);
+
+ snprintf(path, PATH_MAX, "%s/%s/lock_module/proto_name",
+ sysfs_dir, table);
+
+ /* lock_nolock doesn't create the "lock_module" dir at all,
+ so we'll fail to open this */
+
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ return 1;
+
+ close(fd);
+ return 0;
+}
+
+/* This is for the case where gfs_controld exits/fails, abandoning gfs
+ filesystems in the kernel, and then gfs_controld is restarted. When
+ gfs_controld exits and abandons lockspaces, that node needs to be
+ rebooted to clear the uncontrolled filesystems from the kernel. */
+
+int check_uncontrolled_filesystems(void)
+{
+ DIR *d;
+ struct dirent *de;
+ int count = 0;
+
+ d = opendir("/sys/fs/gfs/");
+ if (!d)
+ goto gfs2;
+
+ while ((de = readdir(d))) {
+ if (de->d_name[0] == '.')
+ continue;
+
+ if (ignore_nolock("/sys/fs/gfs/", de->d_name))
+ continue;
+
+ log_error("found uncontrolled gfs fs %s", de->d_name);
+ count++;
+ }
+ closedir(d);
+
+ gfs2:
+ d = opendir("/sys/fs/gfs2/");
+ if (!d)
+ goto out;
+
+ while ((de = readdir(d))) {
+ if (de->d_name[0] == '.')
+ continue;
+
+ if (ignore_nolock("/sys/fs/gfs2/", de->d_name))
+ continue;
+
+ log_error("found uncontrolled gfs2 fs %s", de->d_name);
+ count++;
+ }
+ closedir(d);
+
+ out:
+ if (count) {
+ kick_node_from_cluster(our_nodeid);
+ return -1;
+ }
+ return 0;
+}
+
More information about the Cluster-cvs mailing list