master - rgmanager: Detect restricted failover domain crash
Lon Hohberger
lon@fedoraproject.org
Wed Sep 24 18:03:00 GMT 2008
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=63f9fc14ecee65b107bc0fe2d8c745839a24fbd0
Commit: 63f9fc14ecee65b107bc0fe2d8c745839a24fbd0
Parent: 69c480c5114618d63dfadd6ce7a6db28d4c149b2
Author: Lon Hohberger <lhh@redhat.com>
AuthorDate: Mon Sep 8 11:52:33 2008 -0400
Committer: Lon Hohberger <lhh@redhat.com>
CommitterDate: Wed Sep 24 13:39:28 2008 -0400
rgmanager: Detect restricted failover domain crash
Mark a service as 'stopped' when its state is 'running' but its
owner node is down. rhbz #435466
---
rgmanager/include/members.h | 1 +
rgmanager/include/reslist.h | 2 +-
rgmanager/src/clulib/members.c | 29 ++++++++
rgmanager/src/daemons/fo_domain.c | 17 ++++-
rgmanager/src/daemons/groups.c | 87 ++++++++++++++++------
rgmanager/src/daemons/rg_state.c | 17 ++++-
rgmanager/src/daemons/service_op.c | 13 +++-
rgmanager/src/daemons/slang_event.c | 18 +----
rgmanager/src/resources/default_event_script.sl | 3 +-
9 files changed, 140 insertions(+), 47 deletions(-)
diff --git a/rgmanager/include/members.h b/rgmanager/include/members.h
index 08feee0..5aa9f1e 100644
--- a/rgmanager/include/members.h
+++ b/rgmanager/include/members.h
@@ -17,6 +17,7 @@ void free_member_list(cluster_member_list_t *ml);
void member_set_state(int nodeid, int state);
int memb_count(cluster_member_list_t *ml);
int member_online(int nodeid);
+int member_online_set(int **nodes, int *nodecount);
int memb_online(cluster_member_list_t *ml, int nodeid);
int memb_online_name(cluster_member_list_t *ml, char *name);
int memb_name_to_id(cluster_member_list_t *ml, char *name);
diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h
index 7440341..7b0934a 100644
--- a/rgmanager/include/reslist.h
+++ b/rgmanager/include/reslist.h
@@ -184,7 +184,7 @@ void deconstruct_domains(fod_t **domains);
void print_domains(fod_t **domains);
int node_should_start(int nodeid, cluster_member_list_t *membership,
char *rg_name, fod_t **domains);
-int node_domain_set(fod_t *domain, int **ret, int *retlen);
+int node_domain_set(fod_t **domains, char *name, int **ret, int *retlen, int *flags);
int node_domain_set_safe(char *domainname, int **ret, int *retlen, int *flags);
diff --git a/rgmanager/src/clulib/members.c b/rgmanager/src/clulib/members.c
index ee7f1fe..fb77cea 100644
--- a/rgmanager/src/clulib/members.c
+++ b/rgmanager/src/clulib/members.c
@@ -195,6 +195,35 @@ member_list(void)
}
+/**
+ * Build an array holding the node IDs of the online cluster members.
+ *
+ * Holds memblock for reading while it walks the current membership list
+ * and copies the node ID of every entry that is flagged as a member and
+ * has a nonzero node ID.
+ *
+ * @param nodes		Receives a malloc()ed array of node IDs which the
+ *			caller must free().  Set to NULL on failure.
+ * @param nodecount	Receives the number of IDs stored in *nodes.
+ *			Set to 0 on failure.
+ * @return		0 on success; 1 if there is no membership list or
+ *			the allocation fails.
+ */
+int
+member_online_set(int **nodes, int *nodecount)
+{
+	int ret = 1, i;
+
+	/*
+	 * Give both outputs defined values before anything can fail, so a
+	 * caller that frees or inspects them after an error is safe.
+	 */
+	*nodes = NULL;
+	*nodecount = 0;
+
+	pthread_rwlock_rdlock(&memblock);
+	if (!membership)
+		goto out_unlock;
+
+	/* Worst case: every entry in the list is online */
+	*nodes = malloc(sizeof(**nodes) * membership->cml_count);
+	if (!*nodes)
+		goto out_unlock;
+
+	for (i = 0; i < membership->cml_count; i++) {
+		if (membership->cml_members[i].cn_member &&
+		    membership->cml_members[i].cn_nodeid != 0) {
+			(*nodes)[*nodecount] = membership->cml_members[i].cn_nodeid;
+			++(*nodecount);
+		}
+	}
+
+	ret = 0;
+out_unlock:
+	pthread_rwlock_unlock(&memblock);
+	return ret;
+}
+
+
void
member_set_state(int nodeid, int state)
{
diff --git a/rgmanager/src/daemons/fo_domain.c b/rgmanager/src/daemons/fo_domain.c
index c17b0e3..97f244c 100644
--- a/rgmanager/src/daemons/fo_domain.c
+++ b/rgmanager/src/daemons/fo_domain.c
@@ -347,13 +347,24 @@ node_in_domain(char *nodename, fod_t *domain,
int
-node_domain_set(fod_t *domain, int **ret, int *retlen)
+node_domain_set(fod_t **domains, char *name, int **ret, int *retlen, int *flags)
{
int x, i, j;
int *tmpset;
int ts_count;
-
fod_node_t *fodn;
+ fod_t *domain;
+ int rv = -1, found = 0;
+
+ list_for(domains, domain, x) {
+ if (!strcasecmp(domain->fd_name, name)) {
+ found = 1;
+ break;
+ }
+ } // while (!list_done(&_domains, fod));
+
+ if (!found)
+ return -1;
/* Count domain length */
list_for(&domain->fd_nodes, fodn, x) { }
@@ -366,6 +377,8 @@ node_domain_set(fod_t *domain, int **ret, int *retlen)
if (!(*tmpset))
return -1;
+ *flags = domain->fd_flags;
+
if (domain->fd_flags & FOD_ORDERED) {
for (i = 1; i <= 100; i++) {
diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c
index 91c5fae..3927479 100644
--- a/rgmanager/src/daemons/groups.c
+++ b/rgmanager/src/daemons/groups.c
@@ -39,7 +39,8 @@ pthread_rwlock_t resource_lock = PTHREAD_RWLOCK_INITIALIZER;
void res_build_name(char *, size_t, resource_t *);
int get_rg_state_local(char *, rg_state_t *);
-int group_migratory(char *, int);
+int group_migratory(char *groupname, int lock);
+int _group_property(char *groupname, char *property, char *ret, size_t len);
struct status_arg {
@@ -70,23 +71,9 @@ node_should_start_safe(uint32_t nodeid, cluster_member_list_t *membership,
int
node_domain_set_safe(char *domainname, int **ret, int *retlen, int *flags)
{
- fod_t *fod;
- int rv = -1, found = 0, x = 0;
-
+ int rv = 0;
pthread_rwlock_rdlock(&resource_lock);
-
- list_for(&_domains, fod, x) {
- if (!strcasecmp(fod->fd_name, domainname)) {
- found = 1;
- break;
- }
- } // while (!list_done(&_domains, fod));
-
- if (found) {
- rv = node_domain_set(fod, ret, retlen);
- *flags = fod->fd_flags;
- }
-
+ rv = node_domain_set(&_domains, domainname, ret, retlen, flags);
pthread_rwlock_unlock(&resource_lock);
return rv;
@@ -420,6 +407,47 @@ check_depend_safe(char *rg_name)
}
+/**
+ * Queue a stop for a service whose restricted failover domain has no
+ * online members.
+ *
+ * Looks up the service's "domain" property, fetches that domain's node
+ * set and flags, and intersects the node set with the set of online
+ * cluster members.  If the domain is restricted and the intersection is
+ * empty, an RG_STOP request is queued for the service.
+ *
+ * Uses the unlocked _group_property() and reads _domains directly.
+ * NOTE(review): presumably the caller already holds resource_lock - confirm.
+ *
+ * @param svcName	Name of the service to check.
+ * @return		Always 0.  Lookup and allocation failures are
+ *			treated as "nothing to do".
+ */
+int
+check_rdomain_crash(char *svcName)
+{
+	int *nodes = NULL, nodecount = 0;
+	int *fd_nodes = NULL, fd_nodecount = 0, fl = 0;
+	int *isect = NULL, icount = 0;
+	char fd_name[256];
+
+	if (_group_property(svcName, "domain", fd_name, sizeof(fd_name)) != 0)
+		goto out_free;
+
+	/* _group_property() copies with strncpy(), which may not terminate */
+	fd_name[sizeof(fd_name) - 1] = 0;
+
+	/* node_domain_set() takes the address of the domain list head */
+	if (node_domain_set(&_domains, fd_name, &fd_nodes,
+			    &fd_nodecount, &fl) != 0)
+		goto out_free;
+
+	if (!(fl & FOD_RESTRICTED))
+		goto out_free;
+
+	/*
+	 * Fetch the online members.  Without this call the intersection
+	 * below would run on a NULL array with an indeterminate count.
+	 */
+	if (member_online_set(&nodes, &nodecount) != 0)
+		goto out_free;
+
+	if (s_intersection(fd_nodes, fd_nodecount, nodes, nodecount,
+			   &isect, &icount) < 0)
+		goto out_free;
+
+	if (icount == 0) {
+		clulog(LOG_NOTICE, "Marking %s as stopped: "
+		       "Restricted domain unavailable\n", svcName);
+		rt_enqueue_request(svcName, RG_STOP, NULL, 0, 0,
+				   0, 0);
+	}
+
+out_free:
+	/* free(NULL) is a no-op, so no guards are needed */
+	free(fd_nodes);
+	free(nodes);
+	free(isect);
+
+	return 0;
+}
+
+
/**
Start or failback a resource group: if it's not running, start it.
If it is running and we're a better member to run it, then ask for
@@ -433,6 +461,7 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus,
cman_node_t *mp;
int autostart, exclusive;
struct dlm_lksb lockp;
+ int fod_ret;
mp = memb_id_to_p(membership, my_id());
assert(mp);
@@ -527,10 +556,13 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus,
* Start any stopped services, or started services
* that are owned by a down node.
*/
- if (node_should_start(mp->cn_nodeid, membership, svcName, &_domains) ==
- FOD_BEST)
+ fod_ret = node_should_start(mp->cn_nodeid, membership,
+ svcName, &_domains);
+ if (fod_ret == FOD_BEST)
rt_enqueue_request(svcName, RG_START, NULL, 0, mp->cn_nodeid,
0, 0);
+ else if (fod_ret == FOD_ILLEGAL)
+ check_rdomain_crash(svcName);
}
@@ -1045,15 +1077,13 @@ out:
@return 0 on success, -1 on failure.
*/
int
-group_property(char *groupname, char *property, char *ret, size_t len)
+_group_property(char *groupname, char *property, char *ret, size_t len)
{
resource_t *res = NULL;
int x = 0;
- pthread_rwlock_rdlock(&resource_lock);
res = find_root_by_ref(&_resources, groupname);
if (!res) {
- pthread_rwlock_unlock(&resource_lock);
return -1;
}
@@ -1061,15 +1091,24 @@ group_property(char *groupname, char *property, char *ret, size_t len)
if (strcasecmp(res->r_attrs[x].ra_name, property))
continue;
strncpy(ret, res->r_attrs[x].ra_value, len);
- pthread_rwlock_unlock(&resource_lock);
return 0;
}
- pthread_rwlock_unlock(&resource_lock);
return -1;
}
+/**
+ * Locked wrapper around _group_property(): holds resource_lock for
+ * reading for the duration of the lookup.
+ *
+ * @param groupname	Passed through to _group_property().
+ * @param property	Passed through to _group_property().
+ * @param ret_val	Buffer passed through to receive the value.
+ * @param len		Size of ret_val, passed through.
+ * @return		Whatever _group_property() returns.
+ */
+int
+group_property(char *groupname, char *property, char *ret_val, size_t len)
+{
+	int rv;
+
+	pthread_rwlock_rdlock(&resource_lock);
+	rv = _group_property(groupname, property, ret_val, len);
+	pthread_rwlock_unlock(&resource_lock);
+
+	return rv;
+}
+
+
/**
Send the state of a resource group to a given file descriptor.
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c
index 8a205ba..c57b148 100644
--- a/rgmanager/src/daemons/rg_state.c
+++ b/rgmanager/src/daemons/rg_state.c
@@ -463,6 +463,8 @@ get_rg_state_local(char *name, rg_state_t *svcblk)
* 3 = DO NOT stop service, return RG_EFORWARD
* 4 = DO NOT stop service, return RG_EAGAIN
* 5 = DO NOT stop service, return RG_EFROZEN
+ * 6 = DO NOT stop service, mark stopped and return
+ * RG_SUCCESS (0)
*/
int
svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
@@ -527,9 +529,10 @@ svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
/*
Service is marked as running but node is down.
- Doesn't make much sense to stop it.
+ Doesn't make much sense to stop it - but we need
+ to mark it stopped
*/
- ret = 2;
+ ret = 6;
break;
case RG_STATE_ERROR:
@@ -1298,6 +1301,16 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate)
clulog(LOG_DEBUG, "Unable to stop RG %s in %s state\n",
svcName, rg_state_str(svcStatus.rs_state));
return RG_EFAIL;
+ case 6:
+ /* Mark stopped, but do not do anything */
+ svcStatus.rs_last_owner = svcStatus.rs_owner;
+ svcStatus.rs_owner = 0;
+ svcStatus.rs_state = RG_STATE_STOPPED;
+ if (set_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ return RG_EFAIL;
+ }
+ /* FALLTHROUGH */
case 2:
rg_unlock(&lockp);
return RG_ESUCCESS;
diff --git a/rgmanager/src/daemons/service_op.c b/rgmanager/src/daemons/service_op.c
index 653e988..855975f 100644
--- a/rgmanager/src/daemons/service_op.c
+++ b/rgmanager/src/daemons/service_op.c
@@ -133,8 +133,17 @@ service_op_stop(char *svcName, int do_disable, int event_type)
if (get_service_state_internal(svcName, &svcStatus) < 0)
return RG_EFAIL;
- if (svcStatus.rs_owner > 0)
- msgtarget = svcStatus.rs_owner;
+ if (svcStatus.rs_owner > 0) {
+ if (member_online(svcStatus.rs_owner)) {
+ msgtarget = svcStatus.rs_owner;
+ } else {
+ /* If the owner is not online,
+ mark the service as 'stopped' but
+ otherwise, do nothing.
+ */
+ return svc_stop(svcName, RG_STOP);
+ }
+ }
if (msg_open(MSG_CLUSTER, msgtarget, RG_PORT, &ctx, 2)< 0) {
clulog(LOG_ERR,
diff --git a/rgmanager/src/daemons/slang_event.c b/rgmanager/src/daemons/slang_event.c
index a50b1af..737e01a 100644
--- a/rgmanager/src/daemons/slang_event.c
+++ b/rgmanager/src/daemons/slang_event.c
@@ -575,24 +575,12 @@ push_int_array(int *stuff, int len)
void
sl_nodes_online(void)
{
- int i, *nodes, nodecount = 0;
+ int x, *nodes = NULL, nodecount = 0;
- cluster_member_list_t *membership = member_list();
- if (!membership)
- return;
- nodes = malloc(sizeof(int) * membership->cml_count);
- if (!nodes)
+ x = member_online_set(&nodes, &nodecount);
+ if (x < 0 || !nodes || !nodecount)
return;
- nodecount = 0;
- for (i = 0; i < membership->cml_count; i++) {
- if (membership->cml_members[i].cn_member &&
- membership->cml_members[i].cn_nodeid != 0) {
- nodes[nodecount] = membership->cml_members[i].cn_nodeid;
- ++nodecount;
- }
- }
- free_member_list(membership);
push_int_array(nodes, nodecount);
free(nodes);
}
diff --git a/rgmanager/src/resources/default_event_script.sl b/rgmanager/src/resources/default_event_script.sl
index 8e519fa..df9bce0 100644
--- a/rgmanager/src/resources/default_event_script.sl
+++ b/rgmanager/src/resources/default_event_script.sl
@@ -31,7 +31,8 @@ define move_or_start(service, node_list)
len = length(node_list);
if (len == 0) {
- debug(service, " is not runnable");
+ notice(service, " is not runnable - restricted domain offline");
+ ()=service_stop(service);
return ERR_DOMAIN;
}
More information about the Cluster-cvs
mailing list