master - rgmanager: Detect restricted failover domain crash

Lon Hohberger lon@fedoraproject.org
Wed Sep 24 18:03:00 GMT 2008


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=63f9fc14ecee65b107bc0fe2d8c745839a24fbd0
Commit:        63f9fc14ecee65b107bc0fe2d8c745839a24fbd0
Parent:        69c480c5114618d63dfadd6ce7a6db28d4c149b2
Author:        Lon Hohberger <lhh@redhat.com>
AuthorDate:    Mon Sep 8 11:52:33 2008 -0400
Committer:     Lon Hohberger <lhh@redhat.com>
CommitterDate: Wed Sep 24 13:39:28 2008 -0400

rgmanager: Detect restricted failover domain crash

Mark service as 'stopped' when it is 'running' but the
node is down.  rhbz #435466
---
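[Not part of the patch] The new check_rdomain_crash() below boils down to a set test: take the nodes that are currently online, take the members of the service's failover domain, and if the domain is restricted and the two sets do not intersect, mark the service stopped instead of leaving it 'running' on a dead node. A minimal standalone sketch of that test follows; intersection_count() and the hard-coded node arrays are illustrative stand-ins for rgmanager's member_online_set()/s_intersection(), and the FOD_RESTRICTED value is assumed for the example only.

#include <stdio.h>

#define FOD_RESTRICTED 0x1	/* assumed value, for illustration only */

/* Count node IDs present in both sets (stand-in for s_intersection()). */
static int
intersection_count(const int *a, int alen, const int *b, int blen)
{
	int i, j, count = 0;

	for (i = 0; i < alen; i++)
		for (j = 0; j < blen; j++)
			if (a[i] == b[j])
				++count;
	return count;
}

int
main(void)
{
	int online[] = { 1, 4 };	/* nodes currently online */
	int domain[] = { 2, 3 };	/* members of the restricted domain */
	int flags = FOD_RESTRICTED;

	if ((flags & FOD_RESTRICTED) &&
	    intersection_count(domain, 2, online, 2) == 0)
		printf("Restricted domain unavailable: mark service stopped\n");
	else
		printf("Service can start or fail over normally\n");

	return 0;
}
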
 rgmanager/include/members.h                     |    1 +
 rgmanager/include/reslist.h                     |    2 +-
 rgmanager/src/clulib/members.c                  |   29 ++++++++
 rgmanager/src/daemons/fo_domain.c               |   17 ++++-
 rgmanager/src/daemons/groups.c                  |   87 ++++++++++++++++------
 rgmanager/src/daemons/rg_state.c                |   17 ++++-
 rgmanager/src/daemons/service_op.c              |   13 +++-
 rgmanager/src/daemons/slang_event.c             |   18 +----
 rgmanager/src/resources/default_event_script.sl |    3 +-
 9 files changed, 140 insertions(+), 47 deletions(-)

diff --git a/rgmanager/include/members.h b/rgmanager/include/members.h
index 08feee0..5aa9f1e 100644
--- a/rgmanager/include/members.h
+++ b/rgmanager/include/members.h
@@ -17,6 +17,7 @@ void free_member_list(cluster_member_list_t *ml);
 void member_set_state(int nodeid, int state);
 int memb_count(cluster_member_list_t *ml);
 int member_online(int nodeid);
+int member_online_set(int **nodes, int *nodecount);
 int memb_online(cluster_member_list_t *ml, int nodeid);
 int memb_online_name(cluster_member_list_t *ml, char *name);
 int memb_name_to_id(cluster_member_list_t *ml, char *name);
diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h
index 7440341..7b0934a 100644
--- a/rgmanager/include/reslist.h
+++ b/rgmanager/include/reslist.h
@@ -184,7 +184,7 @@ void deconstruct_domains(fod_t **domains);
 void print_domains(fod_t **domains);
 int node_should_start(int nodeid, cluster_member_list_t *membership,
 		      char *rg_name, fod_t **domains);
-int node_domain_set(fod_t *domain, int **ret, int *retlen);
+int node_domain_set(fod_t **domains, char *name, int **ret, int *retlen, int *flags);
 int node_domain_set_safe(char *domainname, int **ret, int *retlen, int *flags);
 
 
diff --git a/rgmanager/src/clulib/members.c b/rgmanager/src/clulib/members.c
index ee7f1fe..fb77cea 100644
--- a/rgmanager/src/clulib/members.c
+++ b/rgmanager/src/clulib/members.c
@@ -195,6 +195,35 @@ member_list(void)
 }
 
 
+int
+member_online_set(int **nodes, int *nodecount)
+{
+	int ret = 1, i;
+
+	pthread_rwlock_rdlock(&memblock);
+	if (!membership)
+		goto out_unlock;
+
+	*nodes = malloc(sizeof(int) * membership->cml_count);
+	if (!*nodes)
+		goto out_unlock;
+
+	*nodecount = 0;
+	for (i = 0; i < membership->cml_count; i++) {
+		if (membership->cml_members[i].cn_member &&
+		    membership->cml_members[i].cn_nodeid != 0) {
+			(*nodes)[*nodecount] = membership->cml_members[i].cn_nodeid;
+			++(*nodecount);
+		}
+	}
+
+	ret = 0;
+out_unlock:
+	pthread_rwlock_unlock(&memblock);
+	return ret;
+}
+
+
 void
 member_set_state(int nodeid, int state)
 {
diff --git a/rgmanager/src/daemons/fo_domain.c b/rgmanager/src/daemons/fo_domain.c
index c17b0e3..97f244c 100644
--- a/rgmanager/src/daemons/fo_domain.c
+++ b/rgmanager/src/daemons/fo_domain.c
@@ -347,13 +347,24 @@ node_in_domain(char *nodename, fod_t *domain,
 
 
 int
-node_domain_set(fod_t *domain, int **ret, int *retlen)
+node_domain_set(fod_t **domains, char *name, int **ret, int *retlen, int *flags)
 {
 	int x, i, j;
 	int *tmpset;
 	int ts_count;
-
 	fod_node_t *fodn;
+	fod_t *domain;
+	int rv = -1, found = 0;
+
+	list_for(domains, domain, x) {
+		if (!strcasecmp(domain->fd_name, name)) {
+			found = 1;
+			break;
+		}
+	} // while (!list_done(&_domains, fod));
+
+	if (!found)
+		return -1;
 
 	/* Count domain length */
 	list_for(&domain->fd_nodes, fodn, x) { }
@@ -366,6 +377,8 @@ node_domain_set(fod_t *domain, int **ret, int *retlen)
 	if (!(*tmpset))
 		return -1;
 
+	*flags = domain->fd_flags;
+
 	if (domain->fd_flags & FOD_ORDERED) {
 		for (i = 1; i <= 100; i++) {
 			
diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c
index 91c5fae..3927479 100644
--- a/rgmanager/src/daemons/groups.c
+++ b/rgmanager/src/daemons/groups.c
@@ -39,7 +39,8 @@ pthread_rwlock_t resource_lock = PTHREAD_RWLOCK_INITIALIZER;
 
 void res_build_name(char *, size_t, resource_t *);
 int get_rg_state_local(char *, rg_state_t *);
-int group_migratory(char *, int);
+int group_migratory(char *groupname, int lock);
+int _group_property(char *groupname, char *property, char *ret, size_t len);
 
 
 struct status_arg {
@@ -70,23 +71,9 @@ node_should_start_safe(uint32_t nodeid, cluster_member_list_t *membership,
 int
 node_domain_set_safe(char *domainname, int **ret, int *retlen, int *flags)
 {
-	fod_t *fod;
-	int rv = -1, found = 0, x = 0;
-
+	int rv = 0;
 	pthread_rwlock_rdlock(&resource_lock);
-
-	list_for(&_domains, fod, x) {
-		if (!strcasecmp(fod->fd_name, domainname)) {
-			found = 1;
-			break;
-		}
-	} // while (!list_done(&_domains, fod));
-
-	if (found) {
-		rv = node_domain_set(fod, ret, retlen);
-		*flags = fod->fd_flags;
-	}
-
+	rv = node_domain_set(&_domains, domainname, ret, retlen, flags);
 	pthread_rwlock_unlock(&resource_lock);
 
 	return rv;
@@ -420,6 +407,47 @@ check_depend_safe(char *rg_name)
 }
 
 
+int
+check_rdomain_crash(char *svcName)
+{
+	int *nodes = NULL, nodecount;
+	int *fd_nodes = NULL, fd_nodecount, fl;
+	int *isect = NULL, icount;
+	char fd_name[256];
+
+	if (_group_property(svcName, "domain", fd_name, sizeof(fd_name)) != 0)
+		goto out_free;
+
+	if (member_online_set(&nodes, &nodecount) != 0)
+		goto out_free;
+
+	if (node_domain_set(&_domains, fd_name, &fd_nodes,
+			    &fd_nodecount, &fl) != 0)
+		goto out_free;
+
+	if (!(fl & FOD_RESTRICTED))
+		goto out_free;
+	
+	if (s_intersection(fd_nodes, fd_nodecount, nodes, nodecount, 
+		    &isect, &icount) < 0)
+		goto out_free;
+
+	if (icount == 0) {
+		clulog(LOG_NOTICE, "Marking %s as stopped: "
+		       "Restricted domain unavailable\n", svcName);
+		rt_enqueue_request(svcName, RG_STOP, NULL, 0, 0,
+				   0, 0);
+	}
+
+out_free:
+	if (fd_nodes)
+		free(fd_nodes);
+	if (nodes)
+		free(nodes);
+	if (isect)
+		free(isect);
+
+	return 0;
+}
+
+
 /**
   Start or failback a resource group: if it's not running, start it.
   If it is running and we're a better member to run it, then ask for
@@ -433,6 +461,7 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus,
 	cman_node_t *mp;
 	int autostart, exclusive;
 	struct dlm_lksb lockp;
+	int fod_ret;
 
 	mp = memb_id_to_p(membership, my_id());
 	assert(mp);
@@ -527,10 +556,13 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus,
 	 * Start any stopped services, or started services
 	 * that are owned by a down node.
 	 */
-	if (node_should_start(mp->cn_nodeid, membership, svcName, &_domains) ==
-	    FOD_BEST)
+	fod_ret = node_should_start(mp->cn_nodeid, membership,
+				    svcName, &_domains);
+	if (fod_ret == FOD_BEST)
 		rt_enqueue_request(svcName, RG_START, NULL, 0, mp->cn_nodeid,
 				   0, 0);
+	else if (fod_ret == FOD_ILLEGAL)
+		check_rdomain_crash(svcName);
 }
 
 
@@ -1045,15 +1077,13 @@ out:
    @return		0 on success, -1 on failure.
  */
 int
-group_property(char *groupname, char *property, char *ret, size_t len)
+_group_property(char *groupname, char *property, char *ret, size_t len)
 {
 	resource_t *res = NULL;
 	int x = 0;
 
-	pthread_rwlock_rdlock(&resource_lock);
 	res = find_root_by_ref(&_resources, groupname);
 	if (!res) {
-		pthread_rwlock_unlock(&resource_lock);
 		return -1;
 	}
 
@@ -1061,15 +1091,24 @@ group_property(char *groupname, char *property, char *ret, size_t len)
 		if (strcasecmp(res->r_attrs[x].ra_name, property))
 			continue;
 		strncpy(ret, res->r_attrs[x].ra_value, len);
-		pthread_rwlock_unlock(&resource_lock);
 		return 0;
 	}
-	pthread_rwlock_unlock(&resource_lock);
 
 	return -1;
 }
 
 
+int
+group_property(char *groupname, char *property, char *ret_val, size_t len)
+{
+	int ret = -1;
+	pthread_rwlock_rdlock(&resource_lock);
+	ret = _group_property(groupname, property, ret_val, len);
+	pthread_rwlock_unlock(&resource_lock);
+	return ret;
+}
+	
+
 /**
   Send the state of a resource group to a given file descriptor.
 
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c
index 8a205ba..c57b148 100644
--- a/rgmanager/src/daemons/rg_state.c
+++ b/rgmanager/src/daemons/rg_state.c
@@ -463,6 +463,8 @@ get_rg_state_local(char *name, rg_state_t *svcblk)
  *                      3 = DO NOT stop service, return RG_EFORWARD
  *			4 = DO NOT stop service, return RG_EAGAIN
  *			5 = DO NOT stop service, return RG_EFROZEN
+ *			6 = DO NOT stop service, mark stopped and return
+ *			    RG_SUCCESS (0)
  */
 int
 svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
@@ -527,9 +529,10 @@ svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
 
 		/*
 		   Service is marked as running but node is down.
-		   Doesn't make much sense to stop it.
+		   Doesn't make much sense to stop it - but we need
+		   to mark it stopped
 		 */
-		ret = 2;
+		ret = 6;
 		break;
 
 	case RG_STATE_ERROR:
@@ -1298,6 +1301,16 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate)
 		clulog(LOG_DEBUG, "Unable to stop RG %s in %s state\n",
 		       svcName, rg_state_str(svcStatus.rs_state));
 		return RG_EFAIL;
+	case 6:
+		/* Mark stopped, but do not do anything */
+		svcStatus.rs_last_owner = svcStatus.rs_owner;
+		svcStatus.rs_owner = 0;
+		svcStatus.rs_state = RG_STATE_STOPPED;
+		if (set_rg_state(svcName, &svcStatus) != 0) {
+			rg_unlock(&lockp);
+			return RG_EFAIL;
+		}
+		/* FALLTHROUGH */
 	case 2:
 		rg_unlock(&lockp);
 		return RG_ESUCCESS;
diff --git a/rgmanager/src/daemons/service_op.c b/rgmanager/src/daemons/service_op.c
index 653e988..855975f 100644
--- a/rgmanager/src/daemons/service_op.c
+++ b/rgmanager/src/daemons/service_op.c
@@ -133,8 +133,17 @@ service_op_stop(char *svcName, int do_disable, int event_type)
 
 	if (get_service_state_internal(svcName, &svcStatus) < 0)
 		return RG_EFAIL;
-	if (svcStatus.rs_owner > 0)
-		msgtarget = svcStatus.rs_owner;
+	if (svcStatus.rs_owner > 0) {
+		if (member_online(svcStatus.rs_owner)) {
+			msgtarget = svcStatus.rs_owner;
+		} else {
+			/* If the owner is not online, 
+			   mark the service as 'stopped' but
+			   otherwise, do nothing.
+			 */
+			return svc_stop(svcName, RG_STOP);
+		}
+	}
 
 	if (msg_open(MSG_CLUSTER, msgtarget, RG_PORT, &ctx, 2)< 0) {
 		clulog(LOG_ERR,
diff --git a/rgmanager/src/daemons/slang_event.c b/rgmanager/src/daemons/slang_event.c
index a50b1af..737e01a 100644
--- a/rgmanager/src/daemons/slang_event.c
+++ b/rgmanager/src/daemons/slang_event.c
@@ -575,24 +575,12 @@ push_int_array(int *stuff, int len)
 void
 sl_nodes_online(void)
 {
-	int i, *nodes, nodecount = 0;
+	int x, *nodes = NULL, nodecount = 0;
 
-	cluster_member_list_t *membership = member_list();
-	if (!membership)
-		return;
-	nodes = malloc(sizeof(int) * membership->cml_count);
-	if (!nodes)
+	x = member_online_set(&nodes, &nodecount);
+	if (x < 0 || !nodes || !nodecount)
 		return;
 
-	nodecount = 0;
-	for (i = 0; i < membership->cml_count; i++) {
-		if (membership->cml_members[i].cn_member &&
-		    membership->cml_members[i].cn_nodeid != 0) {
-			nodes[nodecount] = membership->cml_members[i].cn_nodeid;
-			++nodecount;
-		}
-	}
-	free_member_list(membership);
 	push_int_array(nodes, nodecount);
 	free(nodes);
 }
diff --git a/rgmanager/src/resources/default_event_script.sl b/rgmanager/src/resources/default_event_script.sl
index 8e519fa..df9bce0 100644
--- a/rgmanager/src/resources/default_event_script.sl
+++ b/rgmanager/src/resources/default_event_script.sl
@@ -31,7 +31,8 @@ define move_or_start(service, node_list)
 
 	len = length(node_list);
 	if (len == 0) {
-		debug(service, " is not runnable");
+		notice(service, " is not runnable - restricted domain offline");
+		()=service_stop(service);
 		return ERR_DOMAIN;
 	}
 

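[Not part of the patch] The new member_online_set() helper follows an allocate-in-callee, free-in-caller convention, as the refactored sl_nodes_online() above shows. A compilable sketch of that calling pattern follows; the static helper here is only a stub standing in for the real clulib/members.c implementation, which copies the online node IDs out of the cached membership list under a read lock.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stub stand-in for member_online_set(); returns 0 on success and hands
 * the caller a malloc'd array of online node IDs to free. */
static int
member_online_set(int **nodes, int *nodecount)
{
	static const int online[] = { 1, 2, 4 };	/* pretend membership */

	*nodes = malloc(sizeof(online));
	if (!*nodes)
		return 1;
	memcpy(*nodes, online, sizeof(online));
	*nodecount = sizeof(online) / sizeof(online[0]);
	return 0;
}

int
main(void)
{
	int *nodes = NULL, nodecount = 0, i;

	if (member_online_set(&nodes, &nodecount) != 0 || !nodes || !nodecount)
		return 1;	/* no membership data available */

	for (i = 0; i < nodecount; i++)
		printf("node %d is online\n", nodes[i]);

	free(nodes);		/* caller owns the array */
	return 0;
}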