Cluster Project branch, master, updated. cluster-2.99.02-30-gd33f4f1

Wed May 28 20:52:00 GMT 2008

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Cluster Project".

http://sources.redhat.com/git/gitweb.cgi?p=cluster.git;a=commitdiff;h=d33f4f1df3e8f84603418d0192c1af18794d3136

The branch, master, has been updated
       via  d33f4f1df3e8f84603418d0192c1af18794d3136 (commit)
       via  a0beb92c8f95009dccec00bd94390e1b4ccd742d (commit)
       via  4028ac54e1ee56c8069961473344648ec111712f (commit)
      from  d1d29545d7efeefbe4cd540bdd3dfd248d2310c0 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
commit d33f4f1df3e8f84603418d0192c1af18794d3136
Author: David Teigland <teigland@redhat.com>
Date:   Wed May 21 15:37:38 2008 -0500

    gfs_controld: restructuring
    
    - copying the code structure/organization of dlm_controld
    - isolate the cluster2 code from what will be the cluster3 code
    - add libgfscontrol and gfs_control
    - use libgfscontrol between gfs_controld and mount.gfs
    - eliminate umount.gfs, no longer used
    
    Signed-off-by: David Teigland <teigland@redhat.com>

commit a0beb92c8f95009dccec00bd94390e1b4ccd742d
Author: David Teigland <teigland@redhat.com>
Date:   Wed May 21 15:33:50 2008 -0500

    gfs_controld: move recover.c
    
    Move recover.c into cpg-old.c in preparation for new version.
    
    Signed-off-by: David Teigland <teigland@redhat.com>

commit 4028ac54e1ee56c8069961473344648ec111712f
Author: David Teigland <teigland@redhat.com>
Date:   Wed May 14 16:26:06 2008 -0500

    gfs_controld: rename files
    
    Renaming files in preparation for new version.
    
    Signed-off-by: David Teigland <teigland@redhat.com>

-----------------------------------------------------------------------

Summary of changes:
 gfs2/mount/Makefile                 |   17 +-
 gfs2/mount/mount.gfs2.c             |   20 +-
 gfs2/mount/umount.gfs2.c            |  168 ---
 gfs2/mount/util.c                   |  475 ++-----
 gfs2/mount/util.h                   |    2 +-
 group/gfs_control/Makefile          |   44 +
 group/gfs_control/main.c            |  212 +++
 group/gfs_controld/Makefile         |   12 +-
 group/gfs_controld/config.c         |  180 +++
 group/gfs_controld/config.h         |   47 +
 group/gfs_controld/cpg-old.c        | 2686 +++++++++++++++++++++++++++++++++
 group/gfs_controld/cpg-old.h        |   60 +
 group/gfs_controld/cpg.c            |  289 ----
 group/gfs_controld/gfs_controld.h   |   49 +
 group/gfs_controld/gfs_daemon.h     |  268 ++++
 group/gfs_controld/group.c          |   64 +-
 group/gfs_controld/lock_dlm.h       |  310 ----
 group/gfs_controld/main.c           | 1219 +++++++++-------
 group/gfs_controld/member_cman.c    |   29 +-
 group/gfs_controld/plock.c          |  228 ++--
 group/gfs_controld/recover.c        | 2805 -----------------------------------
 group/gfs_controld/util.c           |  197 +++
 group/libgfscontrol/Makefile        |   53 +
 group/libgfscontrol/libgfscontrol.h |  131 ++
 group/libgfscontrol/main.c          |  437 ++++++
 25 files changed, 5322 insertions(+), 4680 deletions(-)
 delete mode 100644 gfs2/mount/umount.gfs2.c
 create mode 100644 group/gfs_control/Makefile
 create mode 100644 group/gfs_control/main.c
 create mode 100644 group/gfs_controld/config.c
 create mode 100644 group/gfs_controld/config.h
 create mode 100644 group/gfs_controld/cpg-old.c
 create mode 100644 group/gfs_controld/cpg-old.h
 delete mode 100644 group/gfs_controld/cpg.c
 create mode 100644 group/gfs_controld/gfs_controld.h
 create mode 100644 group/gfs_controld/gfs_daemon.h
 delete mode 100644 group/gfs_controld/lock_dlm.h
 delete mode 100644 group/gfs_controld/recover.c
 create mode 100644 group/gfs_controld/util.c
 create mode 100644 group/libgfscontrol/Makefile
 create mode 100644 group/libgfscontrol/libgfscontrol.h
 create mode 100644 group/libgfscontrol/main.c

diff --git a/gfs2/mount/Makefile b/gfs2/mount/Makefile
index 3b01f04..191431f 100644
--- a/gfs2/mount/Makefile
+++ b/gfs2/mount/Makefile
@@ -11,9 +11,8 @@
 ###############################################################################
 
 TARGET1= mount.gfs2
-TARGET2= umount.gfs2
 
-all: ${TARGET1} ${TARGET2}
+all: ${TARGET1}
 
 include ../../make/defines.mk
 include $(OBJDIR)/make/cobj.mk
@@ -21,8 +20,6 @@ include $(OBJDIR)/make/clean.mk
 
 OBJS1=	mount.gfs2.o
 
-OBJS2=	umount.gfs2.o
-
 SHAREDOBJS= ondisk1.o \
 	    util.o \
 	    mtab.o
@@ -30,25 +27,23 @@ SHAREDOBJS= ondisk1.o \
 CFLAGS += -DHELPER_PROGRAM -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
 CFLAGS += -I${gfskincdir}
 CFLAGS += -I${KERNEL_SRC}/fs/gfs2/ -I${KERNEL_SRC}/include/
-CFLAGS += -I$(S)/../include
+CFLAGS += -I$(S)/../include -I../../group/libgfscontrol/
 CFLAGS += -I${incdir}
 
+LDFLAGS += -L../../group/libgfscontrol/ -lgfscontrol
 
-${TARGET1}: ${SHAREDOBJS} ${OBJS1}
-	$(CC) -o $@ $^ $(LDFLAGS)
 
-${TARGET2}: ${SHAREDOBJS} ${OBJS2}
+${TARGET1}: ${SHAREDOBJS} ${OBJS1}
 	$(CC) -o $@ $^ $(LDFLAGS)
 
 install: all
 	mkdir -p ${DESTDIR}/sbin
-	install ${TARGET1} ${TARGET2} ${DESTDIR}/sbin
+	install ${TARGET1} ${DESTDIR}/sbin
 
 uninstall:
-	${UNINSTALL} ${TARGET1} ${TARGET2} ${DESTDIR}/sbin
+	${UNINSTALL} ${TARGET1} ${DESTDIR}/sbin
 
 clean: generalclean
 
 -include $(OBJS1:.o=.d)
--include $(OBJS2:.o=.d)
 -include $(SHAREDOBJS:.o=.d)
diff --git a/gfs2/mount/mount.gfs2.c b/gfs2/mount/mount.gfs2.c
index 153b648..6c30d44 100644
--- a/gfs2/mount/mount.gfs2.c
+++ b/gfs2/mount/mount.gfs2.c
@@ -149,11 +149,11 @@ static int mount_lockproto(char *proto, struct mount_options *mo,
 	return rv;
 }
 
-static void mount_result_lockproto(char *proto, struct mount_options *mo,
+static void mount_done_lockproto(char *proto, struct mount_options *mo,
 			     	    struct gen_sb *sb, int result)
 {
 	if (!strcmp(proto, "lock_dlm"))
-		lock_dlm_mount_result(mo, sb, result);
+		lock_dlm_mount_done(mo, sb, result);
 }
 
 static void umount_lockproto(char *proto, struct mount_options *mo,
@@ -227,7 +227,9 @@ int main(int argc, char **argv)
 	   adding the mtab entry */
 	block_sigint();
 
-	if (!fake_mount) {
+	if (fake_mount)
+		goto do_mtab;
+
 	rv = mount_lockproto(proto, &mo, &sb);
 	if (rv < 0)
 		die("error mounting lockproto %s\n", proto);
@@ -235,7 +237,7 @@ int main(int argc, char **argv)
 	rv = mount(mo.dev, mo.dir, fsname, mo.flags, mo.extra_plus);
 	if (rv) {
 		log_debug("mount(2) failed error %d errno %d", rv, errno);
-		mount_result_lockproto(proto, &mo, &sb, rv);
+		mount_done_lockproto(proto, &mo, &sb, rv);
 
 		if (!(mo.flags & MS_REMOUNT))
 			umount_lockproto(proto, &mo, &sb, errno);
@@ -246,17 +248,19 @@ int main(int argc, char **argv)
 		    strerror(errno));
 	}
 	log_debug("mount(2) ok");
-	mount_result_lockproto(proto, &mo, &sb, 0);
-	}
+	mount_done_lockproto(proto, &mo, &sb, 0);
+
+ do_mtab:
+	if (no_mtab)
+		goto out;
 
-	if (!no_mtab) {
 	if (mo.flags & MS_REMOUNT) {
                 del_mtab_entry(&mo);
                 add_mtab_entry(&mo);
         } else
 		add_mtab_entry(&mo);
-	}
 
+ out:
 	unblock_sigint();
 
 	return rv ? 1 : 0;
diff --git a/gfs2/mount/umount.gfs2.c b/gfs2/mount/umount.gfs2.c
deleted file mode 100644
index 009e16e..0000000
--- a/gfs2/mount/umount.gfs2.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
- */
-
-#include "util.h"
-
-char *prog_name;
-char *fsname;
-char *expert;
-int verbose;
-static sigset_t old_sigset;
-
-static void print_version(void)
-{
-	printf("umount.gfs2 %s (built %s %s)\n", RELEASE_VERSION,
-	       __DATE__, __TIME__);
-}
-
-static void print_usage(void)
-{
-	printf("Usage:\n");
-	printf("This program is called by umount(8), it should not be used directly.\n");
-	printf("If umount(8) fails to call umount.gfs2, you can clean up with\n");
-	printf("> umount.gfs2 -v -X lock_dlm <mountpoint>\n");
-
-}
-
-static void block_sigint(void)
-{
-	sigset_t new;
-
-	sigemptyset(&new);
-	sigaddset(&new, SIGINT);
-	sigprocmask(SIG_BLOCK, &new, &old_sigset);
-}
-
-static void unblock_sigint(void)
-{
-	sigprocmask(SIG_SETMASK, &old_sigset, NULL);
-}
-
-static void read_options(int argc, char **argv, struct mount_options *mo)
-{
-	int cont = 1;
-	int optchar;
-	int l;
-
-	/* FIXME: check for "quiet" option and don't print in that case */
-
-	while (cont) {
-		optchar = getopt(argc, argv, "fhVvX:r");
-
-		switch (optchar) {
-		case EOF:
-			cont = 0;
-			break;
-
-		case 'f':    /* autofs umount from /sbin/halt uses this */
-			break;
-
-		case 'v':
-			++verbose;
-			break;
-
-		case 'X':
-			expert = strdup(optarg);
-			log_debug("umount expert override: %s", expert);
-			break;
-
-		case 'h':
-			print_usage();
-			exit(EXIT_SUCCESS);
-
-		case 'V':
-			print_version();
-			exit(EXIT_SUCCESS);
-
-		case 'r':
-			break; /* used by umount to remount ro if umount fails */
-
-		default:
-			break;
-		}
-	}
-
-	if (optind < argc && argv[optind]) {
-		strncpy(mo->dir, argv[optind], PATH_MAX);
-		l = strlen(mo->dir) - 1;
-		while (l > 0 && mo->dir[l] == '/') {
-			mo->dir[l] = '\0';
-			l--;
-		};
-	}
-
-	log_debug("umount %s", mo->dir);
-}
-
-static void check_options(struct mount_options *mo)
-{
-	if (!strlen(mo->dir))
-		die("no mount point specified\n");
-}
-
-static int umount_lockproto(char *proto, struct mount_options *mo,
-			     struct gen_sb *sb)
-{
-	int rv = 0;
-
-	if (!strcmp(proto, "lock_dlm"))
-		rv = lock_dlm_leave(mo, sb, 0);
-	return rv;
-}
-
-int main(int argc, char **argv)
-{
-	struct mount_options mo;
-	struct gen_sb sb;
-	char *proto;
-	int rv;
-
-	memset(&mo, 0, sizeof(mo));
-	memset(&sb, 0, sizeof(sb));
-
-	prog_name = argv[0];
-
-	if (!strstr(prog_name, "gfs"))
-		die("invalid umount helper name \"%s\"\n", prog_name);
-
-	fsname = (strstr(prog_name, "gfs2")) ? "gfs2" : "gfs";
-
-	if (argc < 2) {
-		print_usage();
-		exit(EXIT_SUCCESS);
-	}
-
-	read_options(argc, argv, &mo);
-
-	if (expert)
-		return umount_lockproto(expert, &mo, &sb);
-
-	check_options(&mo);
-	read_proc_mounts(&mo);
-	get_sb(mo.dev, &sb);
-	parse_opts(&mo);
-
-	block_sigint();
-
-	rv = umount(mo.dir);
-	if (rv) {
-		if (errno == EBUSY)
-			die("%s: device is busy.\n", mo.dir);
-		else
-			die("error %d unmounting %s\n", errno, mo.dir);
-	}
-	proto = select_lockproto(&mo, &sb);
-	umount_lockproto(proto, &mo, &sb);
-
-	del_mtab_entry(&mo);
-
-	unblock_sigint();
-
-	return 0;
-}
-
diff --git a/gfs2/mount/util.c b/gfs2/mount/util.c
index cbb2973..f8e6fee 100644
--- a/gfs2/mount/util.c
+++ b/gfs2/mount/util.c
@@ -7,15 +7,13 @@
  */
 
 #include "util.h"
+#include "libgfscontrol.h"
 
 extern char *prog_name;
 extern char *fsname;
 extern int verbose;
-static int gfs_controld_fd = -1;
-static int adding_another_mountpoint;
 
-#define LOCK_DLM_SOCK_PATH "gfs_controld_sock"	/* FIXME: use a header */
-#define MAXLINE 256			/* size of messages with gfs_controld */
+static int gfs_controld_fd;
 
 /* opt_map stuff from util-linux */
 
@@ -320,268 +318,123 @@ char *select_lockproto(struct mount_options *mo, struct gen_sb *sb)
 		return sb->lockproto;
 }
 
-static int gfs_controld_connect(void)
-{
-	struct sockaddr_un sun;
-	socklen_t addrlen;
-	int rv, fd;
-
-	fd = socket(PF_UNIX, SOCK_STREAM, 0);
-	if (fd < 0) {
-		warn("can't create socket for gfs_controld connection: %s",
-		     strerror(errno));
-		goto out;
-	}
-
-	memset(&sun, 0, sizeof(sun));
-	sun.sun_family = AF_UNIX;
-	strcpy(&sun.sun_path[1], LOCK_DLM_SOCK_PATH);
-	addrlen = sizeof(sa_family_t) + strlen(sun.sun_path+1) + 1;
-
-	rv = connect(fd, (struct sockaddr *) &sun, addrlen);
-	if (rv < 0) {
-		warn("can't connect to gfs_controld: %s", strerror(errno));
-		close(fd);
-		fd = rv;
-	}
- out:
-	return fd;
-}
-
-#if 0
-/* We create a pipe and pass the receiving end to gfs_controld.  If the
-   mount fails, we write an error message to this pipe.  gfs_controld monitors
-   this fd outside its main poll loop because it may need to detect a mount
-   failure while watching for the kernel mount (while waiting for the kernel
-   mount, gfs_controld is _not_ in its main poll loop which is why the normal
-   leave message w/ mnterr we send isn't sufficient.) */
-
-void setup_mount_error_fd(int socket)
-{
-	struct msghdr msg;
-	struct cmsghdr *cmsg;
-	struct iovec vec;
-	char tmp[CMSG_SPACE(sizeof(int))];
-	char ch = '\0';
-	ssize_t n;
-	int rv, fds[2];
-
-	rv = pipe(fds);
-	if (rv < 0) {
-		log_debug("setup_mount_error_fd pipe error %d %d", rv, errno);
-		return;
-	}
-
-	memset(&msg, 0, sizeof(msg));
-
-	msg.msg_control = (caddr_t)tmp;
-	msg.msg_controllen = CMSG_LEN(sizeof(int));
-	cmsg = CMSG_FIRSTHDR(&msg);
-	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_RIGHTS;
-	*(int *)CMSG_DATA(cmsg) = fds[0];
-
-	vec.iov_base = &ch;
-	vec.iov_len = 1;
-	msg.msg_iov = &vec;
-	msg.msg_iovlen = 1;
-
-	n = sendmsg(socket, &msg, 0);
-	if (n < 0) {
-		log_debug("setup_mount_error_fd sendmsg error %d %d", n, errno);
-		close(fds[0]);
-		close(fds[1]);
-		return;
-	}
-
-	mount_error_fd = fds[1];
-
-	log_debug("setup_mount_error_fd %d %d", fds[0], fds[1]);
-}
-#endif
-
 int lock_dlm_join(struct mount_options *mo, struct gen_sb *sb)
 {
-	int i, fd, rv;
-	char buf[MAXLINE];
-	char *dir, *proto, *table, *options;
-
-	i = 0;
-	do {
-		fd = gfs_controld_connect();
-		if (fd <= 0)
-			sleep(1);
-	} while (fd <= 0 && ++i < 10);
-
-	/* FIXME: should we start the daemon here? */
-	if (fd < 0) {
-		warn("gfs_controld not running");
-		rv = -1;
-		goto out;
-	}
+	struct gfsc_mount_args ma;
+	int fd, rv, result;
 
-	dir = mo->dir;
-	proto = "lock_dlm";
-	options = mo->opts;
+	memset(&ma, 0, sizeof(ma));
 
+	strncpy(ma.dir, mo->dir, PATH_MAX);
+	strncpy(ma.type, fsname, PATH_MAX);
+	strncpy(ma.proto, "lock_dlm", PATH_MAX);
+	strncpy(ma.options, mo->opts, PATH_MAX);
+	strncpy(ma.dev, mo->dev, PATH_MAX);
 	if (mo->locktable[0])
-		table = mo->locktable;
+		strncpy(ma.table, mo->locktable, PATH_MAX);
 	else
-		table = sb->locktable;
+		strncpy(ma.table, sb->locktable, PATH_MAX);
 
-	/*
-	 * send request to gfs_controld for it to join mountgroup:
-	 * "join <mountpoint> gfs2 lock_dlm <locktable> <options> <dev>"
-	 */
-
-	memset(buf, 0, sizeof(buf));
-	rv = snprintf(buf, MAXLINE, "join %s %s %s %s %s %s",
-		      dir, fsname, proto, table, options, mo->dev);
-	if (rv >= MAXLINE) {
-		warn("gfs_controld message too long: %d \"%s\"", rv, buf);
-		rv = -1;
-		goto out;
+	fd = gfsc_fs_connect();
+	if (fd < 0) {
+		warn("gfs_controld join connect error: %s", strerror(errno));
+		return fd;
 	}
 
-	log_debug("message to gfs_controld: asking to join mountgroup:");
-	log_debug("write \"%s\"", buf);
+	/* tell gfs_controld to join the mountgroup */
 
-	rv = write(fd, buf, sizeof(buf));
+	rv = gfsc_fs_join(fd, &ma);
 	if (rv < 0) {
-		warn("gfs_controld write error: %s", strerror(errno));
-		goto out;
+		warn("gfs_controld join write error: %s", strerror(errno));
+		goto fail;
 	}
 
-#if 0
-	setup_mount_error_fd(fd);
-#endif
-
-	/*
-	 * read response from gfs_controld to our join request:
-	 * it sends back an int as a string, 0 or -EXXX
-	 */
+	/* read the result of the join from gfs_controld */
 
-	memset(buf, 0, sizeof(buf));
-	rv = read(fd, buf, sizeof(buf));
+	rv = gfsc_fs_result(fd, &result, &ma);
 	if (rv < 0) {
-		warn("error reading result from gfs_controld: %s",
-		     strerror(errno));
-		goto out;
+		warn("gfs_controld result read error: %s", strerror(errno));
+		goto fail;
 	}
-	rv = atoi(buf);
+
+	rv = result;
 
 	switch (rv) {
 	case 0:
+	case -EALREADY:
 		break;
 
-	case -EEXIST:
-		warn("mount group already exists. "
-		     "Duplicate locktable name %s, or %s already mounted",
-		     table, mo->dev);
-		goto out;
-
 	case -EPROTONOSUPPORT:
 		warn("lockproto not supported");
-		goto out;
+		goto fail;
 
 	case -EOPNOTSUPP:
 		warn("jid, first and id are reserved options");
-		goto out;
+		goto fail;
 
 	case -EBADFD:
 		warn("no colon found in table name");
-		goto out;
+		goto fail;
 
 	case -ENAMETOOLONG:
 		warn("fs name too long");
-		goto out;
+		goto fail;
 
 	case -ESTALE:
 		warn("fs is being unmounted");
-		goto out;
+		goto fail;
 
 	case -EADDRINUSE:
 		warn("different fs appears to exist with the same name");
-		goto out;
+		goto fail;
 
 	case -EBUSY:
 		warn("mount point already used or other mount in progress");
-		goto out;
-
-	case -EALREADY:
-		log_debug("fs already mounted, adding mountpoint");
-		adding_another_mountpoint = 1;
-		rv = 0;
-		goto out;
+		goto fail;
 
 	case -ENOMEM:
 		warn("out of memory");
-		goto out;
+		goto fail;
 
 	case -EBADR:
 		warn("fs is for a different cluster");
-		goto out;
+		goto fail;
 
 	case -ENOANO:
 		warn("node not a member of the default fence domain");
-		goto out;
+		goto fail;
 
 	case -EROFS:
 		warn("read-only mount invalid with spectator option");
-		goto out;
+		goto fail;
 
 	case -EMLINK:
 		warn("option string too long");
-		goto out;
+		goto fail;
 
 	default:
 		warn("gfs_controld join error: %d", rv);
-		goto out;
-	}
-
-	log_debug("message from gfs_controld: response to join request:");
-	log_debug("lock_dlm_join: read \"%s\"", buf);
-
-	/*
-	 * read mount-option string from gfs_controld that we are to
-	 * use for the mount syscall; or possibly error message
-	 */
-
-	memset(buf, 0, sizeof(buf));
-	rv = read(fd, buf, sizeof(buf));
-	if (rv < 0) {
-		warn("gfs_controld options read error: %d", rv);
-		goto out;
+		goto fail;
 	}
 
-	log_debug("message from gfs_controld: mount options:");
-	log_debug("lock_dlm_join: read \"%s\"", buf);
-
 	/*
-	 * gfs_controld returns "hostdata=jid=X:id=Y:first=Z"
-	 * this is first combined with any hostdata the user gave on
+	 * In addition to the result, gfs_controld also returns
+	 * "hostdata=jid=X:id=Y:first=Z" in ma.hostdata.
+	 * This is first combined with any hostdata the user gave on
 	 * the command line and then the full hostdata is combined
 	 * with the "extra" mount otions into the "extra_plus" string.
-	 * If we're not allowed to mount, "error: foo" is returned.
 	 */
 
-	if (!strncmp(buf, "error", 5)) {
-		warn("%s", buf);
-		rv = -1;
-		goto out;
-	}
-
-	if (strlen(mo->hostdata) + strlen(buf) + 1 > PATH_MAX) {
+	if (strlen(mo->hostdata) + strlen(ma.hostdata) + 1 > PATH_MAX) {
 		warn("hostdata too long");
 		rv = -1;
-		goto out;
+		goto fail;
 	}
 
 	if (!mo->hostdata[0])
-		snprintf(mo->hostdata, PATH_MAX, "%s", buf);
+		snprintf(mo->hostdata, PATH_MAX, "%s", ma.hostdata);
 	else {
-		char *p = strstr(buf, "=") + 1;
+		char *p = strstr(ma.hostdata, "=") + 1;
 		strcat(mo->hostdata, ":");
 		strcat(mo->hostdata, p);
 	}
@@ -594,143 +447,76 @@ int lock_dlm_join(struct mount_options *mo, struct gen_sb *sb)
 		snprintf(mo->extra_plus, PATH_MAX, "%s,%s",
 			 mo->extra, mo->hostdata);
 
-	log_debug("lock_dlm_join: extra_plus: \"%s\"", mo->extra_plus);
-	rv = 0;
- out:
-#if 0
-	close(fd);
-#endif
-	gfs_controld_fd = fd;
+	/* keep gfs_controld connection open and reuse it below to
+	   send the result of mount(2) to gfs_controld, except in
+	   the case of another mount (EALREADY) */
+	   
+	if (rv == -EALREADY)
+		gfsc_fs_disconnect(fd);
+	else
+		gfs_controld_fd = fd;
+
+	return 0;
+
+ fail:
+	gfsc_fs_disconnect(fd);
 	return rv;
 }
 
-void lock_dlm_mount_result(struct mount_options *mo, struct gen_sb *sb,
-			   int result)
+void lock_dlm_mount_done(struct mount_options *mo, struct gen_sb *sb,
+			 int result)
 {
+	struct gfsc_mount_args ma;
 	int rv;
-	char buf[MAXLINE];
 
-	/* if we didn't do the lock_dlm_join */
-	if (gfs_controld_fd <= 0)
+	if (!gfs_controld_fd)
 		return;
 
-	memset(buf, 0, sizeof(buf));
-	rv = snprintf(buf, MAXLINE, "mount_result %s %s %d", mo->dir, fsname,
-		      result);
-	if (rv >= MAXLINE) {
-		warn("lock_dlm_mount_result: message too long: %d \"%s\"\n",
-		     rv, buf);
-		goto out;
-	}
+	memset(&ma, 0, sizeof(ma));
 
-	log_debug("lock_dlm_mount_result: write \"%s\"", buf);
+	strncpy(ma.dir, mo->dir, PATH_MAX);
+	strncpy(ma.type, fsname, PATH_MAX);
+	strncpy(ma.proto, "lock_dlm", PATH_MAX);
+	strncpy(ma.options, mo->opts, PATH_MAX);
+	strncpy(ma.dev, mo->dev, PATH_MAX);
+	if (mo->locktable[0])
+		strncpy(ma.table, mo->locktable, PATH_MAX);
+	else
+		strncpy(ma.table, sb->locktable, PATH_MAX);
 
-	rv = write(gfs_controld_fd, buf, sizeof(buf));
-	if (rv < 0) {
-		warn("lock_dlm_mount_result: gfs_controld write error: %d", rv);
-	}
- out:
-	close(gfs_controld_fd);
+	/* tell gfs_controld the result of mount(2) */
+
+	rv = gfsc_fs_mount_done(gfs_controld_fd, &ma, result);
+	if (rv)
+		warn("gfs_controld mount_done write error: %s", strerror(errno));
+
+	gfsc_fs_disconnect(gfs_controld_fd);
 }
 
 int lock_dlm_leave(struct mount_options *mo, struct gen_sb *sb, int mnterr)
 {
-	int i, fd, rv;
-	char buf[MAXLINE];
-
-	if (mnterr && adding_another_mountpoint)
-		return 0;
-
-	i = 0;
-	do {
-		fd = gfs_controld_connect();
-		if (fd <= 0)
-			sleep(1);
-	} while (fd <= 0 && ++i < 10);
-
-	if (fd <= 0) {
-		warn("gfs_controld not running");
-		rv = -1;
-		goto out;
-	}
-
-	/*
-	 * send request to gfs_controld for it to leave mountgroup:
-	 * "leave <mountpoint> <fstype> <mnterr>"
-	 *
-	 * mnterr is 0 if this leave is associated with an unmount.
-	 * mnterr is !0 if this leave is due to a failed kernel mount
-	 * in which case gfs_controld shouldn't wait for the kernel mount
-	 * to complete before doing the leave.
-	 */
-
-	memset(buf, 0, sizeof(buf));
-	rv = snprintf(buf, MAXLINE, "leave %s %s %d", mo->dir, fsname, mnterr);
-	if (rv >= MAXLINE) {
-		warn("lock_dlm_leave: message too long: %d \"%s\"\n", rv, buf);
-		rv = -1;
-		goto out;
-	}
-
-	log_debug("message to gfs_controld: asking to leave mountgroup:");
-	log_debug("lock_dlm_leave: write \"%s\"", buf);
-
-#if 0
-	if (mnterr && mount_error_fd) {
-		rv = write(mount_error_fd, buf, sizeof(buf));
-		log_debug("lock_dlm_leave: write to mount_error_fd %d", rv);
-	}
-#endif
+	struct gfsc_mount_args ma;
+	int rv;
 
-	rv = write(fd, buf, sizeof(buf));
-	if (rv < 0) {
-		warn("lock_dlm_leave: gfs_controld write error: %d", rv);
-		goto out;
-	}
+	memset(&ma, 0, sizeof(ma));
 
-	/*
-	 * read response from gfs_controld to our leave request:
-	 * int as a string, 0 or -EXXX
-	 */
+	strncpy(ma.dir, mo->dir, PATH_MAX);
+	strncpy(ma.type, fsname, PATH_MAX);
 
-	memset(buf, 0, sizeof(buf));
-	rv = read(fd, buf, sizeof(buf));
-	if (rv < 0) {
-		warn("lock_dlm_leave: gfs_controld read error: %d", rv);
-		goto out;
-	}
-	rv = atoi(buf);
-	if (rv < 0) {
-		warn("lock_dlm_leave: gfs_controld leave error: %d", rv);
-		goto out;
-	}
+	rv = gfsc_fs_leave(&ma, mnterr);
+	if (rv)
+		warn("leave: gfs_controld leave error: %s", strerror(errno));
 
-	log_debug("message from gfs_controld: response to leave request:");
-	log_debug("lock_dlm_leave: read \"%s\"", buf);
-	rv = 0;
- out:
-	close(fd);
 	return rv;
 }
 
 int lock_dlm_remount(struct mount_options *mo, struct gen_sb *sb)
 {
-	int i, fd, rv;
-	char buf[MAXLINE];
+	struct gfsc_mount_args ma;
 	char *mode;
+	int fd, rv, result;
 
-	i = 0;
-	do {
-		fd = gfs_controld_connect();
-		if (fd <= 0)
-			sleep(1);
-	} while (fd <= 0 && ++i < 10);
-
-	if (fd <= 0) {
-		warn("gfs_controld not running");
-		rv = -1;
-		goto out;
-	}
+	memset(&ma, 0, sizeof(ma));
 
 	/* FIXME: how to check for spectator remounts, we want
 	   to disallow remount to/from spectator */
@@ -740,78 +526,37 @@ int lock_dlm_remount(struct mount_options *mo, struct gen_sb *sb)
 	else
 		mode = "rw";
 
-	/*
-	 * send request to gfs_controld for it to remount:
-	 * "remount <mountpoint> gfs2 <mode>"
-	 */
+	strncpy(ma.dir, mo->dir, PATH_MAX);
+	strncpy(ma.type, fsname, PATH_MAX);
+	strncpy(ma.options, mode, PATH_MAX);
 
-	memset(buf, 0, sizeof(buf));
-	rv = snprintf(buf, MAXLINE, "remount %s %s %s", mo->dir, fsname, mode);
-	if (rv >= MAXLINE) {
-		warn("remount message too large: %d \"%s\"\n", rv, buf);
-		rv = -1;
-		goto out;
-	}
-
-	log_debug("message to gfs_controld: asking to remount:");
-	log_debug("lock_dlm_remount: write \"%s\"", buf);
-
-	rv = write(fd, buf, sizeof(buf));
-	if (rv < 0) {
-		warn("lock_dlm_remount: gfs_controld write error: %d", rv);
-		goto out;
+	fd = gfsc_fs_connect();
+	if (fd < 0) {
+		warn("gfs_controld remount connect error: %s", strerror(errno));
+		return fd;
 	}
 
-	/*
-	 * read response from gfs_controld
-	 * int as a string
-	 * 1: go ahead
-	 * -EXXX: error
-	 * 0: wait for second result
-	 */
+	/* tell gfs_controld about the new mount options */
 
-	memset(buf, 0, sizeof(buf));
-	rv = read(fd, buf, sizeof(buf));
-	if (rv < 0) {
-		warn("lock_dlm_remount: gfs_controld read1 error: %d", rv);
-		goto out;
-	}
-	rv = atoi(buf);
-	if (rv < 0) {
-		warn("lock_dlm_remount: gfs_controld remount error: %d", rv);
-		goto out;
-	}
-	if (rv == 1) {
-		rv = 0;
+	rv = gfsc_fs_remount(fd, &ma);
+	if (rv) {
+		warn("gfs_controld remount write error: %s", strerror(errno));
 		goto out;
 	}
 
-	log_debug("message from gfs_controld: response to remount request:");
-	log_debug("lock_dlm_remount: read \"%s\"", buf);
+	/* read the result of the remount from gfs_controld */
 
-	/*
-	 * read second result from gfs_controld
-	 */
-
-	memset(buf, 0, sizeof(buf));
-	rv = read(fd, buf, sizeof(buf));
+	rv = gfsc_fs_result(fd, &result, &ma);
 	if (rv < 0) {
-		warn("lock_dlm_remount: gfs_controld read2 error: %d", rv);
+		warn("gfs_controld result read error: %s", strerror(errno));
 		goto out;
 	}
 
-	log_debug("message from gfs_controld: remount result:");
-	log_debug("lock_dlm_remount: read \"%s\"", buf);
-
-	if (!strncmp(buf, "error", 5)) {
-		warn("%s", buf);
-		rv = -1;
-		goto out;
-	}
-
-	rv = 0;
+	rv = result;
+	if (rv)
+		warn("remount not allowed from gfs_controld");
  out:
-	close(fd);
+	gfsc_fs_disconnect(fd);
 	return rv;
 }
 
diff --git a/gfs2/mount/util.h b/gfs2/mount/util.h
index 89faabc..ffa2dd6 100644
--- a/gfs2/mount/util.h
+++ b/gfs2/mount/util.h
@@ -88,7 +88,7 @@ void parse_opts(struct mount_options *mo);
 void read_proc_mounts(struct mount_options *mo);
 int get_sb(char *device, struct gen_sb *sb_out);
 int lock_dlm_join(struct mount_options *mo, struct gen_sb *sb);
-void lock_dlm_mount_result(struct mount_options *mo, struct gen_sb *sb, int result);
+void lock_dlm_mount_done(struct mount_options *mo, struct gen_sb *sb, int result);
 int lock_dlm_leave(struct mount_options *mo, struct gen_sb *sb, int mnterr);
 int lock_dlm_remount(struct mount_options *mo, struct gen_sb *sb);
 
diff --git a/group/gfs_control/Makefile b/group/gfs_control/Makefile
new file mode 100644
index 0000000..772a0d0
--- /dev/null
+++ b/group/gfs_control/Makefile
@@ -0,0 +1,44 @@
+###############################################################################
+###############################################################################
+##
+##  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+##
+##  This copyrighted material is made available to anyone wishing to use,
+##  modify, copy, or redistribute it subject to the terms and conditions
+##  of the GNU General Public License v.2.
+##
+###############################################################################
+###############################################################################
+
+TARGET= gfs_control
+
+SBINDIRT=$(TARGET)
+
+all: depends ${TARGET}
+
+include ../../make/defines.mk
+include $(OBJDIR)/make/cobj.mk
+include $(OBJDIR)/make/clean.mk
+include $(OBJDIR)/make/install.mk
+include $(OBJDIR)/make/uninstall.mk
+
+OBJS=	main.o
+
+#CFLAGS += -I${gfscontrolincdir}
+CFLAGS += -I../libgfscontrol/
+CFLAGS += -I${incdir}
+CFLAGS += -I${KERNEL_SRC}/include/
+
+#LDFLAGS += -L${gfscontrollibdir} -lgfscontrol
+LDFLAGS += -L../libgfscontrol/ -lgfscontrol
+
+
+${TARGET}: ${OBJS}
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+clean: generalclean
+
+depends:
+	$(MAKE) -C ../libgfscontrol all
+
+-include $(OBJS:.o=.d)
diff --git a/group/gfs_control/main.c b/group/gfs_control/main.c
new file mode 100644
index 0000000..1b5752b
--- /dev/null
+++ b/group/gfs_control/main.c
@@ -0,0 +1,212 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2007-2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <sys/types.h>
+#include <sys/un.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <netinet/in.h>
+
+#include "libgfscontrol.h"
+
+#define OPTION_STRING			"hV"
+
+#define OP_LIST				1
+#define OP_DUMP				2
+#define OP_PLOCKS			3
+#define OP_JOIN				4
+#define OP_LEAVE			5
+#define OP_JOINLEAVE			6
+
+static char *prog_name;
+static char *fsname;
+static int operation;
+static int opt_ind;
+
+static void print_usage(void)
+{
+	printf("Usage:\n");
+	printf("\n");
+	printf("%s [options] [ls|dump|plocks]\n", prog_name);
+	printf("\n");
+	printf("Options:\n");
+	printf("  -h               Print this help, then exit\n");
+	printf("  -V               Print program version information, then exit\n");
+	printf("\n");
+}
+
+static void decode_arguments(int argc, char **argv)
+{
+	int cont = 1;
+	int optchar;
+	int need_fsname;
+
+	while (cont) {
+		optchar = getopt(argc, argv, OPTION_STRING);
+
+		switch (optchar) {
+		case 'h':
+			print_usage();
+			exit(EXIT_SUCCESS);
+			break;
+
+		case 'V':
+			printf("%s %s (built %s %s)\n",
+				prog_name, RELEASE_VERSION, __DATE__, __TIME__);
+			/* printf("%s\n", REDHAT_COPYRIGHT); */
+			exit(EXIT_SUCCESS);
+			break;
+
+		case ':':
+		case '?':
+			fprintf(stderr, "Please use '-h' for usage.\n");
+			exit(EXIT_FAILURE);
+			break;
+
+		case EOF:
+			cont = 0;
+			break;
+
+		default:
+			fprintf(stderr, "unknown option: %c\n", optchar);
+			exit(EXIT_FAILURE);
+			break;
+		};
+	}
+
+	need_fsname = 1;
+
+	while (optind < argc) {
+
+		if (!strncmp(argv[optind], "leave", 5) &&
+			   (strlen(argv[optind]) == 5)) {
+			operation = OP_LEAVE;
+			opt_ind = optind + 1;
+			break;
+		} else if (!strncmp(argv[optind], "ls", 2) &&
+			   (strlen(argv[optind]) == 2)) {
+			operation = OP_LIST;
+			opt_ind = optind + 1;
+			need_fsname = 0;
+			break;
+		} else if (!strncmp(argv[optind], "dump", 4) &&
+			   (strlen(argv[optind]) == 4)) {
+			operation = OP_DUMP;
+			opt_ind = optind + 1;
+			need_fsname = 0;
+			break;
+		} else if (!strncmp(argv[optind], "plocks", 6) &&
+			   (strlen(argv[optind]) == 6)) {
+			operation = OP_PLOCKS;
+			opt_ind = optind + 1;
+			break;
+		}
+
+		optind++;
+	}
+
+	if (!operation || !opt_ind) {
+		print_usage();
+		exit(EXIT_FAILURE);
+	}
+
+	if (optind < argc - 1)
+		fsname = argv[opt_ind];
+	else if (need_fsname) {
+		fprintf(stderr, "fs name required\n");
+		exit(EXIT_FAILURE);
+	}
+}
+
+static int do_write(int fd, void *buf, size_t count)
+{
+	int rv, off = 0;
+
+ retry:
+	rv = write(fd, buf + off, count);
+	if (rv == -1 && errno == EINTR)
+		goto retry;
+	if (rv < 0)
+		return rv;
+
+	if (rv != count) {
+		count -= rv;
+		off += rv;
+		goto retry;
+	}
+	return 0;
+}
+
+void do_leave(char *name)
+{
+}
+
+static void do_list(char *name)
+{
+}
+
+static void do_plocks(char *name)
+{
+	char buf[GFSC_DUMP_SIZE];
+
+	memset(buf, 0, sizeof(buf));
+
+	gfsc_dump_plocks(name, buf);
+
+	do_write(STDOUT_FILENO, buf, strlen(buf));
+}
+
+static void do_dump(void)
+{
+	char buf[GFSC_DUMP_SIZE];
+
+	memset(buf, 0, sizeof(buf));
+
+	gfsc_dump_debug(buf);
+
+	do_write(STDOUT_FILENO, buf, strlen(buf));
+}
+
+int main(int argc, char **argv)
+{
+	prog_name = argv[0];
+	decode_arguments(argc, argv);
+
+	switch (operation) {
+
+	case OP_LEAVE:
+		do_leave(fsname);
+		break;
+
+	case OP_LIST:
+		do_list(fsname);
+		break;
+
+	case OP_DUMP:
+		do_dump();
+		break;
+
+	case OP_PLOCKS:
+		do_plocks(fsname);
+		break;
+	}
+	return 0;
+}
+
diff --git a/group/gfs_controld/Makefile b/group/gfs_controld/Makefile
index 2b05312..5b23bfd 100644
--- a/group/gfs_controld/Makefile
+++ b/group/gfs_controld/Makefile
@@ -24,17 +24,19 @@ include $(OBJDIR)/make/uninstall.mk
 
 OBJS= 	main.o \
 	member_cman.o \
-	cpg.o \
+	config.o \
+	cpg-old.o \
 	group.o \
-	plock.o \
-	recover.o
+	util.o \
+	plock.o
 
-CFLAGS += -I${ccsincdir} -I${cmanincdir} -I${openaisincdir}
+CFLAGS += -I${ccsincdir} -I${cmanincdir} -I${openaisincdir} -I${fencedincdir}
 CFLAGS += -I${KERNEL_SRC}/include/
-CFLAGS += -I$(S)/../lib/ -I$(S)/../include/
+CFLAGS += -I$(S)/../lib/ -I$(S)/../include/ -I../libgfscontrol/
 CFLAGS += -I${incdir}
 
 LDFLAGS += -L${ccslibdir} -L${cmanlibdir} -lcman -lccs
+LDFLAGS += -L${fencedlibdir} -lfenced
 LDFLAGS += -L${openaislibdir} -lcpg -lSaCkpt
 LDFLAGS += -L${libdir}
 LDFLAGS += -L../lib -lgroup
diff --git a/group/gfs_controld/config.c b/group/gfs_controld/config.c
new file mode 100644
index 0000000..6731ac6
--- /dev/null
+++ b/group/gfs_controld/config.c
@@ -0,0 +1,180 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <sys/types.h>
+#include <asm/types.h>
+#include <sys/uio.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/utsname.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <netdb.h>
+#include <limits.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include "gfs_daemon.h"
+#include "config.h"
+#include "ccs.h"
+
+/* was a config value set on command line?, 0 or 1.
+   optk is a kernel option, optd is a daemon option */
+
+int optd_groupd_compat;
+int optd_enable_withdraw;
+int optd_enable_plock;
+int optd_plock_debug;
+int optd_plock_rate_limit;
+int optd_plock_ownership;
+int optd_drop_resources_time;
+int optd_drop_resources_count;
+int optd_drop_resources_age;
+
+/* actual config value from command line, cluster.conf, or default.
+   cfgk is a kernel config value, cfgd is a daemon config value */
+
+int cfgd_groupd_compat		= DEFAULT_GROUPD_COMPAT;
+int cfgd_enable_withdraw	= DEFAULT_ENABLE_WITHDRAW;
+int cfgd_enable_plock		= DEFAULT_ENABLE_PLOCK;
+int cfgd_plock_debug		= DEFAULT_PLOCK_DEBUG;
+int cfgd_plock_rate_limit	= DEFAULT_PLOCK_RATE_LIMIT;
+int cfgd_plock_ownership	= DEFAULT_PLOCK_OWNERSHIP;
+int cfgd_drop_resources_time	= DEFAULT_DROP_RESOURCES_TIME;
+int cfgd_drop_resources_count	= DEFAULT_DROP_RESOURCES_COUNT;
+int cfgd_drop_resources_age	= DEFAULT_DROP_RESOURCES_AGE;
+
+
+/* Keep calling ccs_connect() until it returns a non-negative descriptor,
+   sleeping one second after each failure and logging an error on every
+   tenth failed attempt.  Returns the descriptor; the loop has no other
+   exit. */
+
+static int open_ccs(void)
+{
+	int attempts = 0;
+	int cd;
+
+	for (;;) {
+		cd = ccs_connect();
+		if (cd >= 0)
+			break;
+
+		sleep(1);
+		attempts++;
+		if (attempts > 9 && (attempts % 10) == 0)
+			log_error("connect to ccs error %d, "
+				  "check ccsd or cluster status", cd);
+	}
+	return cd;
+}
+
+/* Read the value stored at path through ccs connection cd, convert it with
+   atoi() and, when it is non-negative, store it in *config_val.
+   *config_val is left untouched when the lookup fails or the value is
+   negative.  The string returned by ccs_get() is freed on every path. */
+
+static void read_ccs_int(int cd, char *path, int *config_val)
+{
+	char *str;
+	int val;
+	int error;
+
+	error = ccs_get(cd, path, &str);
+	if (error || !str)
+		return;
+
+	val = atoi(str);
+
+	/* the string is no longer needed; release it before any return so
+	   the invalid-value path does not leak it */
+	free(str);
+
+	if (val < 0) {
+		log_error("ignore invalid value %d for %s", val, path);
+		return;
+	}
+
+	*config_val = val;
+	log_debug("%s is %u", path, val);
+}
+
+#define GROUPD_COMPAT_PATH "/cluster/group/@groupd_compat"
+#define ENABLE_WITHDRAW_PATH "/cluster/gfs_controld/@enable_withdraw"
+#define ENABLE_PLOCK_PATH "/cluster/gfs_controld/@enable_plock"
+#define PLOCK_DEBUG_PATH "/cluster/gfs_controld/@plock_debug"
+#define PLOCK_RATE_LIMIT_PATH "/cluster/gfs_controld/@plock_rate_limit"
+#define PLOCK_OWNERSHIP_PATH "/cluster/gfs_controld/@plock_ownership"
+#define DROP_RESOURCES_TIME_PATH "/cluster/gfs_controld/@drop_resources_time"
+#define DROP_RESOURCES_COUNT_PATH "/cluster/gfs_controld/@drop_resources_count"
+#define DROP_RESOURCES_AGE_PATH "/cluster/gfs_controld/@drop_resources_age"
+
+/* These config values are set from cluster.conf only if they haven't already
+   been set on the command line. */
+
+/* Connect to ccs and, for each daemon setting whose optd_ flag is zero,
+   let read_ccs_int() update the matching cfgd_ value from its ccs path.
+   Settings whose optd_ flag is non-zero are not read. */
+
+void read_ccs(void)
+{
+	struct {
+		int	*opt_set;
+		char	*path;
+		int	*cfg_val;
+	} settings[] = {
+		{ &optd_groupd_compat, GROUPD_COMPAT_PATH,
+		  &cfgd_groupd_compat },
+		{ &optd_enable_withdraw, ENABLE_WITHDRAW_PATH,
+		  &cfgd_enable_withdraw },
+		{ &optd_enable_plock, ENABLE_PLOCK_PATH,
+		  &cfgd_enable_plock },
+		{ &optd_plock_debug, PLOCK_DEBUG_PATH,
+		  &cfgd_plock_debug },
+		{ &optd_plock_rate_limit, PLOCK_RATE_LIMIT_PATH,
+		  &cfgd_plock_rate_limit },
+		{ &optd_plock_ownership, PLOCK_OWNERSHIP_PATH,
+		  &cfgd_plock_ownership },
+		{ &optd_drop_resources_time, DROP_RESOURCES_TIME_PATH,
+		  &cfgd_drop_resources_time },
+		{ &optd_drop_resources_count, DROP_RESOURCES_COUNT_PATH,
+		  &cfgd_drop_resources_count },
+		{ &optd_drop_resources_age, DROP_RESOURCES_AGE_PATH,
+		  &cfgd_drop_resources_age },
+	};
+	int count = sizeof(settings) / sizeof(settings[0]);
+	int cd, i;
+
+	cd = open_ccs();
+	if (cd < 0)
+		return;
+
+	for (i = 0; i < count; i++) {
+		if (*settings[i].opt_set)
+			continue;
+		read_ccs_int(cd, settings[i].path, settings[i].cfg_val);
+	}
+
+	ccs_disconnect(cd);
+}
+
+#define LOCKSPACE_NODIR "/cluster/dlm/lockspace[@name=\"%s\"]/@nodir"
+
+/* Look up the LOCKSPACE_NODIR path built from mg->name in ccs and, when a
+   non-negative value is found, format it into buf as ":nodir=<val>" (at
+   most 32 bytes are written).  buf is left untouched when the lookup fails
+   or the value is negative.  The ccs connection is closed and the string
+   from ccs_get() is freed on every path. */
+
+void read_ccs_nodir(struct mountgroup *mg, char *buf)
+{
+	char path[PATH_MAX];
+	char *str = NULL;
+	int val;
+	int error;
+	int cd;
+
+	cd = open_ccs();
+	if (cd < 0)
+		return;
+
+	/* bounded so an over-long name cannot overflow path */
+	memset(path, 0, PATH_MAX);
+	snprintf(path, PATH_MAX, LOCKSPACE_NODIR, mg->name);
+
+	error = ccs_get(cd, path, &str);
+	if (error || !str)
+		goto out;
+
+	val = atoi(str);
+
+	if (val < 0) {
+		log_error("ignore invalid value %d for %s", val, path);
+		goto out_free;
+	}
+
+	snprintf(buf, 32, ":nodir=%d", val);
+
+	log_debug("%s is %u", path, val);
+ out_free:
+	free(str);
+ out:
+	ccs_disconnect(cd);
+}
+
diff --git a/group/gfs_controld/config.h b/group/gfs_controld/config.h
new file mode 100644
index 0000000..a6c50b6
--- /dev/null
+++ b/group/gfs_controld/config.h
@@ -0,0 +1,47 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+#define DEFAULT_GROUPD_COMPAT 1
+#define DEFAULT_ENABLE_WITHDRAW 1
+#define DEFAULT_ENABLE_PLOCK 1
+#define DEFAULT_PLOCK_DEBUG 0
+#define DEFAULT_PLOCK_RATE_LIMIT 100
+#define DEFAULT_PLOCK_OWNERSHIP 1
+#define DEFAULT_DROP_RESOURCES_TIME 10000 /* 10 sec */
+#define DEFAULT_DROP_RESOURCES_COUNT 10
+#define DEFAULT_DROP_RESOURCES_AGE 10000 /* 10 sec */
+
+extern int optd_groupd_compat;
+extern int optd_enable_withdraw;
+extern int optd_enable_plock;
+extern int optd_plock_debug;
+extern int optd_plock_rate_limit;
+extern int optd_plock_ownership;
+extern int optd_drop_resources_time;
+extern int optd_drop_resources_count;
+extern int optd_drop_resources_age;
+
+extern int cfgd_groupd_compat;
+extern int cfgd_enable_withdraw;
+extern int cfgd_enable_plock;
+extern int cfgd_plock_debug;
+extern int cfgd_plock_rate_limit;
+extern int cfgd_plock_ownership;
+extern int cfgd_drop_resources_time;
+extern int cfgd_drop_resources_count;
+extern int cfgd_drop_resources_age;
+
+#endif
+
diff --git a/group/gfs_controld/cpg-old.c b/group/gfs_controld/cpg-old.c
new file mode 100644
index 0000000..72978ee
--- /dev/null
+++ b/group/gfs_controld/cpg-old.c
@@ -0,0 +1,2686 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "gfs_daemon.h"
+#include "config.h"
+#include "cpg-old.h"
+#include "libgroup.h"
+
+#define JID_INIT	-9
+
+/* mg_member opts bit field */
+
+/* Each value is a distinct bit so several can be combined in
+   mg_member.opts. */
+enum {
+	MEMB_OPT_RW = 1,	/* set together with memb->rw */
+	MEMB_OPT_RO = 2,	/* set together with memb->readonly */
+	MEMB_OPT_SPECT = 4,	/* set together with memb->spectator */
+	MEMB_OPT_RECOVER = 8,	/* member picked for first-mounter recovery */
+};
+
+/* mg_member state: local_recovery_status, recovery_status */
+
+/* Values start at 1, so a zeroed mg_member has none of these states.
+   NOTE(review): only RS_NEED_RECOVERY and RS_SUCCESS are used in the code
+   visible here; the meaning of the others should be confirmed where they
+   are set. */
+enum {
+	RS_NEED_RECOVERY = 1,
+	RS_SUCCESS,
+	RS_GAVEUP,
+	RS_NOFS,
+	RS_READONLY,
+};
+
+/* One node's entry on a mountgroup's members or members_gone list. */
+
+struct mg_member {
+	struct list_head	list;
+	int			nodeid;
+	int			jid;	/* -1 allowed spectator, -2 mount not permitted or failed */
+
+	int			spectator;
+	int			readonly;
+	int			rw;
+	uint32_t		opts;	/* MEMB_OPT_ bits */
+
+	int			tell_gfs_to_recover;
+	int			wait_gfs_recover_done;
+	int			gone_event;
+	int			gone_type;
+	int			finished;
+	int			local_recovery_status;	/* RS_ value */
+	int			recovery_status;	/* RS_ value */
+	int			withdrawing;	/* set when the node's withdraw message arrives */
+	int			needs_journals;
+
+	/* ms_ fields are filled in from the node's mount_status message */
+	int			ms_kernel_mount_done;
+	int			ms_first_mounter;
+	int			ms_kernel_mount_error;
+};
+
+extern group_handle_t gh;
+
+int message_flow_control_on;
+
+/* cpg message protocol
+   1.0.0 is initial version
+   2.0.0 is incompatible with 1.0.0 and allows plock ownership */
+static unsigned int protocol_v100[3] = {1, 0, 0};
+static unsigned int protocol_v200[3] = {2, 0, 0};
+static unsigned int protocol_active[3];
+
+static struct list_head withdrawn_mounts;
+static cpg_handle_t	daemon_handle;
+static struct cpg_name	daemon_name;
+
+
+static void send_journals(struct mountgroup *mg, int nodeid);
+
+
+/* Map a MSG_ message type to its name for log output; any type not in the
+   table yields "unknown". */
+
+static char *msg_name(int type)
+{
+	static const struct {
+		int	type;
+		char	*name;
+	} names[] = {
+		{ MSG_JOURNAL,		"MSG_JOURNAL" },
+		{ MSG_OPTIONS,		"MSG_OPTIONS" },
+		{ MSG_REMOUNT,		"MSG_REMOUNT" },
+		{ MSG_PLOCK,		"MSG_PLOCK" },
+		{ MSG_MOUNT_STATUS,	"MSG_MOUNT_STATUS" },
+		{ MSG_RECOVERY_STATUS,	"MSG_RECOVERY_STATUS" },
+		{ MSG_RECOVERY_DONE,	"MSG_RECOVERY_DONE" },
+		{ MSG_WITHDRAW,		"MSG_WITHDRAW" },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+		if (names[i].type == type)
+			return names[i].name;
+	}
+	return "unknown";
+}
+
+/* Multicast the len bytes at buf on cpg handle h with CPG_TYPE_AGREED.
+   While cpg_mcast_joined() answers CPG_ERR_TRY_AGAIN the send is repeated
+   after a 1000 usec pause, with an error logged every 100 retries; type is
+   only used to name the message in log output.
+   Returns 0 on CPG_OK and -1 on any other error. */
+
+static int _send_message(cpg_handle_t h, void *buf, int len, int type)
+{
+	struct iovec iov;
+	cpg_error_t error;
+	int retries = 0;
+
+	iov.iov_base = buf;
+	iov.iov_len = len;
+
+	for (;;) {
+		error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1);
+		if (error != CPG_ERR_TRY_AGAIN)
+			break;
+
+		retries++;
+		usleep(1000);
+		if (!(retries % 100))
+			log_error("cpg_mcast_joined retry %d %s",
+				   retries, msg_name(type));
+	}
+
+	if (error != CPG_OK) {
+		log_error("cpg_mcast_joined error %d handle %llx %s",
+			  error, (unsigned long long)h, msg_name(type));
+		return -1;
+	}
+
+	if (retries)
+		log_debug("cpg_mcast_joined retried %d %s",
+			  retries, msg_name(type));
+
+	return 0;
+}
+
+/* Complete the gdlm_header at the start of buf and multicast the len-byte
+   message on daemon_handle.  On entry hd->type, hd->nodeid and
+   hd->to_nodeid hold host-order values; they are converted to little
+   endian in place, and the active protocol version and mg->name are
+   filled in.  Returns the value returned by _send_message(). */
+
+int send_group_message_old(struct mountgroup *mg, int len, char *buf)
+{
+	struct gdlm_header *hd = (struct gdlm_header *) buf;
+	/* keep the host-order type for logging; hd->type is converted below */
+	int type = hd->type;
+
+	hd->version[0]	= cpu_to_le16(protocol_active[0]);
+	hd->version[1]	= cpu_to_le16(protocol_active[1]);
+	hd->version[2]	= cpu_to_le16(protocol_active[2]);
+	hd->type	= cpu_to_le16(hd->type);
+	hd->nodeid	= cpu_to_le32(hd->nodeid);
+	hd->to_nodeid	= cpu_to_le32(hd->to_nodeid);
+	/* NOTE(review): copies strlen(mg->name) bytes with no terminator and
+	   no check against the size of hd->name; presumably buf is zeroed by
+	   the caller and the name always fits -- confirm */
+	memcpy(hd->name, mg->name, strlen(mg->name));
+
+	return _send_message(daemon_handle, buf, len, type);
+}
+
+/* Return the first entry on mg->members whose nodeid equals nodeid, or
+   NULL when there is none. */
+
+static struct mg_member *find_memb_nodeid(struct mountgroup *mg, int nodeid)
+{
+	struct mg_member *cur, *match = NULL;
+
+	list_for_each_entry(cur, &mg->members, list) {
+		if (cur->nodeid != nodeid)
+			continue;
+		match = cur;
+		break;
+	}
+	return match;
+}
+
+/* Return the first entry on mg->members whose jid equals jid, or NULL when
+   there is none. */
+
+static struct mg_member *find_memb_jid(struct mountgroup *mg, int jid)
+{
+	struct mg_member *cur, *match = NULL;
+
+	list_for_each_entry(cur, &mg->members, list) {
+		if (cur->jid != jid)
+			continue;
+		match = cur;
+		break;
+	}
+	return match;
+}
+
+/* Report mg->mount_client_result to the mount client through
+   client_reply_join_full().  Nothing is sent while the result is 0 and
+   mount_client_delay is set.  After a zero result mount_client_notified is
+   set; after a non-zero result the group is left, either right away when
+   our own member entry is finished or later via group_leave_on_finish. */
+
+static void notify_mount_client(struct mountgroup *mg)
+{
+	struct mg_member *us;
+
+	if (mg->mount_client_delay && !mg->mount_client_result) {
+		log_group(mg, "notify_mount_client delayed");
+		return;
+	}
+
+	client_reply_join_full(mg, mg->mount_client_result);
+
+	if (!mg->mount_client_result) {
+		mg->mount_client_notified = 1;
+		return;
+	}
+
+	log_group(mg, "leaving due to mount error: %d",
+		  mg->mount_client_result);
+
+	us = find_memb_nodeid(mg, our_nodeid);
+	if (!us->finished) {
+		log_group(mg, "delay leave until after join");
+		mg->group_leave_on_finish = 1;
+		return;
+	}
+
+	group_leave(gh, mg->name);
+}
+
+/* we can receive recovery_status messages from other nodes doing start before
+   we actually process the corresponding start callback ourselves */
+
+/* Queue a copy of the len-byte message in buf, tagged with its type and
+   sending nodeid, at the tail of mg->saved_messages.  The message is
+   silently dropped when the allocation fails. */
+
+void save_message_old(struct mountgroup *mg, char *buf, int len, int from,
+		      int type)
+{
+	struct save_msg *saved;
+
+	/* calloc gives the same zero-filled header + payload area */
+	saved = calloc(1, sizeof(struct save_msg) + len);
+	if (saved == NULL)
+		return;
+
+	saved->type = type;
+	saved->len = len;
+	saved->nodeid = from;
+	memcpy(&saved->buf, buf, len);
+
+	log_group(mg, "save %s from %d len %d", msg_name(type), from, len);
+
+	list_add_tail(&saved->list, &mg->saved_messages);
+}
+
+/* Return the nodeid of the first entry on mg->members that has
+   MEMB_OPT_RECOVER set in opts, or 0 when no entry has it. */
+
+static int first_mounter_recovery(struct mountgroup *mg)
+{
+	struct mg_member *cur;
+	int nodeid = 0;
+
+	list_for_each_entry(cur, &mg->members, list) {
+		if (!(cur->opts & MEMB_OPT_RECOVER))
+			continue;
+		nodeid = cur->nodeid;
+		break;
+	}
+	return nodeid;
+}
+
+/* Return 1 when first_mounter_recovery() names our own node, else 0. */
+
+static int local_first_mounter_recovery(struct mountgroup *mg)
+{
+	return first_mounter_recovery(mg) == our_nodeid;
+}
+
+/* Return 1 when first_mounter_recovery() names a node other than
+   ourselves, else 0 (including when it names no node at all). */
+
+int remote_first_mounter_recovery(struct mountgroup *mg)
+{
+	int recoverer = first_mounter_recovery(mg);
+
+	if (!recoverer)
+		return 0;
+	return recoverer != our_nodeid;
+}
+
+/* Log and then report to the group layer, through group_start_done(), that
+   the start event numbered mg->start_event_nr is done for this group. */
+
+static void start_done(struct mountgroup *mg)
+{
+	log_group(mg, "start_done %d", mg->start_event_nr);
+	group_start_done(gh, mg->name, mg->start_event_nr);
+}
+
+/* Send a header-only MSG_WITHDRAW message from our node, with to_nodeid 0,
+   to the group.  Nothing is sent when the allocation fails. */
+
+void send_withdraw_old(struct mountgroup *mg)
+{
+	int len = sizeof(struct gdlm_header);
+	struct gdlm_header *hd;
+
+	/* zero-filled, so every header field not set below is 0 */
+	hd = calloc(1, len);
+	if (hd == NULL)
+		return;
+
+	hd->type = MSG_WITHDRAW;
+	hd->nodeid = our_nodeid;
+	hd->to_nodeid = 0;
+
+	log_group(mg, "send_withdraw");
+
+	send_group_message_old(mg, len, (char *) hd);
+
+	free(hd);
+}
+
+/* Handle a withdraw message from node "from": mark that member as
+   withdrawing and, when the message is our own, leave the group.  A
+   message from a node that is not on mg->members is only logged. */
+
+static void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from)
+{
+	struct mg_member *sender = find_memb_nodeid(mg, from);
+
+	if (sender == NULL) {
+		log_group(mg, "receive_withdraw no member %d", from);
+		return;
+	}
+
+	log_group(mg, "receive_withdraw from %d", from);
+	sender->withdrawing = 1;
+
+	if (from != our_nodeid)
+		return;
+
+	group_leave(gh, mg->name);
+}
+
+#define SEND_RS_INTS 3
+
+/* Send a MSG_RECOVERY_STATUS message listing every entry on
+   mg->members_gone whose local_recovery_status is RS_SUCCESS.  After the
+   header the payload holds SEND_RS_INTS little-endian ints per entry:
+   nodeid, jid, local_recovery_status.  Nothing is sent when the allocation
+   fails. */
+
+static void send_recovery_status(struct mountgroup *mg)
+{
+	struct gdlm_header *hd;
+	struct mg_member *memb;
+	int count = 0;
+	int len;
+	int *next;
+	char *buf;
+
+	list_for_each_entry(memb, &mg->members_gone, list) {
+		if (memb->local_recovery_status == RS_SUCCESS)
+			count++;
+	}
+
+	len = sizeof(struct gdlm_header) + (count * SEND_RS_INTS * sizeof(int));
+
+	buf = calloc(1, len);
+	if (buf == NULL)
+		return;
+
+	hd = (struct gdlm_header *)buf;
+	hd->type = MSG_RECOVERY_STATUS;
+	hd->nodeid = our_nodeid;
+	hd->to_nodeid = 0;
+
+	/* records are packed back to back right after the header */
+	next = (int *) (buf + sizeof(struct gdlm_header));
+	list_for_each_entry(memb, &mg->members_gone, list) {
+		if (memb->local_recovery_status == RS_SUCCESS) {
+			*next++ = cpu_to_le32(memb->nodeid);
+			*next++ = cpu_to_le32(memb->jid);
+			*next++ = cpu_to_le32(memb->local_recovery_status);
+		}
+	}
+
+	log_group(mg, "send_recovery_status for %d nodes len %d", count, len);
+
+	send_group_message_old(mg, len, buf);
+
+	free(buf);
+}
+
+/* Note: we can get more than one node reporting success in recovering
+   the journal for a failed node.  The first has really recovered it,
+   the rest have found the fs clean and report success. */
+
+/* Apply a recovery_status message from node "from".  The payload after the
+   header is read as records of SEND_RS_INTS little-endian ints (nodeid,
+   jid, status); each status must be RS_SUCCESS and is copied to the
+   recovery_status of the matching mg->members_gone entry, if any.  When
+   the message is our own, start_done() is called afterwards. */
+
+static void _receive_recovery_status(struct mountgroup *mg, char *buf, int len,
+			      int from)
+{
+	struct mg_member *memb;
+	int *rec;
+	int count, i, nodeid, jid, status, found;
+
+	count = (len - sizeof(struct gdlm_header)) /
+		(SEND_RS_INTS * sizeof(int));
+	rec = (int *) (buf + sizeof(struct gdlm_header));
+
+	for (i = 0; i < count; i++, rec += SEND_RS_INTS) {
+		nodeid = le32_to_cpu(rec[0]);
+		jid    = le32_to_cpu(rec[1]);
+		status = le32_to_cpu(rec[2]);
+
+		ASSERT(status == RS_SUCCESS);
+
+		found = 0;
+		list_for_each_entry(memb, &mg->members_gone, list) {
+			if (memb->nodeid == nodeid) {
+				ASSERT(memb->jid == jid);
+				ASSERT(memb->recovery_status == RS_NEED_RECOVERY ||
+				       memb->recovery_status == RS_SUCCESS);
+				memb->recovery_status = status;
+				found = 1;
+				break;
+			}
+		}
+
+		log_group(mg, "receive_recovery_status from %d len %d "
+			  "nodeid %d jid %d status %d found %d",
+			  from, len, nodeid, jid, status, found);
+	}
+
+	if (from != our_nodeid)
+		return;
+
+	start_done(mg);
+}
+
+/* Replay every MSG_RECOVERY_STATUS message queued on mg->saved_messages
+   through _receive_recovery_status(), then unlink and free it.  Saved
+   messages of other types stay queued. */
+
+static void process_saved_recovery_status(struct mountgroup *mg)
+{
+	struct save_msg *cur, *next;
+
+	if (list_empty(&mg->saved_messages))
+		return;
+
+	log_group(mg, "process_saved_recovery_status");
+
+	list_for_each_entry_safe(cur, next, &mg->saved_messages, list) {
+		if (cur->type == MSG_RECOVERY_STATUS) {
+			_receive_recovery_status(mg, cur->buf, cur->len,
+						 cur->nodeid);
+			list_del(&cur->list);
+			free(cur);
+		}
+	}
+}
+
+/* Pick the next first mounter: among mg->members, skip entries whose jid
+   is -2 or JID_INIT and entries that are spectator, readonly, withdrawing
+   or have already reported mount status, then choose the lowest nodeid.
+   The chosen entry gets MEMB_OPT_RECOVER; when no entry qualifies this is
+   only logged. */
+
+static void assign_next_first_mounter(struct mountgroup *mg)
+{
+	struct mg_member *memb, *next = NULL;
+	int low = -1;
+
+	list_for_each_entry(memb, &mg->members, list) {
+		if (memb->jid == -2 || memb->jid == JID_INIT)
+			continue;
+
+		if (memb->spectator || memb->readonly)
+			continue;
+		if (memb->withdrawing || memb->ms_kernel_mount_done)
+			continue;
+
+		if (low != -1 && memb->nodeid >= low)
+			continue;
+
+		next = memb;
+		low = memb->nodeid;
+	}
+
+	if (!next) {
+		log_group(mg, "no next mounter available yet");
+		return;
+	}
+
+	log_group(mg, "next first mounter is %d jid %d opts %x",
+		  next->nodeid, next->jid, next->opts);
+	next->opts |= MEMB_OPT_RECOVER;
+	ASSERT(next->jid >= 0);
+}
+
+#define SEND_MS_INTS 4
+
+/* Send a MSG_MOUNT_STATUS message carrying SEND_MS_INTS little-endian ints
+   after the header: mg->first_mounter, mg->kernel_mount_error and two
+   unused zero slots.  Nothing is sent when the allocation fails. */
+
+static void send_mount_status(struct mountgroup *mg)
+{
+	struct gdlm_header *hd;
+	int *vals;
+	char *buf;
+	int len = sizeof(struct gdlm_header) + (SEND_MS_INTS * sizeof(int));
+
+	buf = calloc(1, len);
+	if (buf == NULL)
+		return;
+
+	hd = (struct gdlm_header *)buf;
+	hd->type = MSG_MOUNT_STATUS;
+	hd->nodeid = our_nodeid;
+	hd->to_nodeid = 0;
+
+	vals = (int *) (buf + sizeof(struct gdlm_header));
+	vals[0] = cpu_to_le32(mg->first_mounter);
+	vals[1] = cpu_to_le32(mg->kernel_mount_error);
+	vals[2] = 0; /* unused */
+	vals[3] = 0; /* unused */
+
+	log_group(mg, "send_mount_status kernel_mount_error %d "
+		      "first_mounter %d",
+		      mg->kernel_mount_error,
+		      mg->first_mounter);
+
+	send_group_message_old(mg, len, buf);
+
+	free(buf);
+}
+
+/* Apply a mount_status message from node "from".  The first two ints after
+   the header are the sender's first_mounter flag and kernel mount error;
+   both are recorded on the sender's member entry.  When the sender was the
+   first mounter, its MEMB_OPT_RECOVER flag is cleared and either the
+   delayed mount client is released (mount succeeded) or a new first
+   mounter is chosen (mount failed).  A message from a node that is not on
+   mg->members is only logged. */
+
+static void _receive_mount_status(struct mountgroup *mg, char *buf, int len,
+				  int from)
+{
+	struct mg_member *memb, *us;
+	int *p;
+
+	p = (int *) (buf + sizeof(struct gdlm_header));
+
+	memb = find_memb_nodeid(mg, from);
+	if (!memb) {
+		log_group(mg, "_receive_mount_status no node %d", from);
+		return;
+	}
+
+	memb->ms_kernel_mount_done = 1;
+	memb->ms_first_mounter = le32_to_cpu(p[0]);
+	memb->ms_kernel_mount_error = le32_to_cpu(p[1]);
+
+	log_group(mg, "_receive_mount_status from %d kernel_mount_error %d "
+		      "first_mounter %d opts %x", from,
+		      memb->ms_kernel_mount_error, memb->ms_first_mounter,
+		      memb->opts);
+
+	/* the sender's first_mounter claim and our MEMB_OPT_RECOVER record
+	   for it must agree in both directions */
+	if (memb->opts & MEMB_OPT_RECOVER) {
+		ASSERT(memb->ms_first_mounter);
+	}
+	if (memb->ms_first_mounter) {
+		ASSERT(memb->opts & MEMB_OPT_RECOVER);
+	}
+
+	if (memb->ms_first_mounter) {
+		memb->opts &= ~MEMB_OPT_RECOVER;
+
+		if (!memb->ms_kernel_mount_error) {
+			/* the first mounter has successfully mounted, we can
+			   go ahead and mount now */
+
+			if (mg->mount_client_delay) {
+				mg->mount_client_delay = 0;
+				notify_mount_client(mg);
+			}
+		} else {
+			/* first mounter mount failed, next low node should be
+			   made first mounter */
+
+			/* jid -2 makes assign_next_first_mounter() skip the
+			   failed node */
+			memb->jid = -2;
+			if (from == our_nodeid)
+				mg->our_jid = -2;
+
+			assign_next_first_mounter(mg);
+
+			/* if we became the next first mounter, then notify
+			   mount client */
+
+			/* NOTE(review): us is dereferenced without a NULL
+			   check; presumably our own node is always on
+			   mg->members here -- confirm */
+			us = find_memb_nodeid(mg, our_nodeid);
+			if (us->opts & MEMB_OPT_RECOVER) {
+				log_group(mg, "we are next first mounter");
+				mg->first_mounter = 1;
+				mg->first_mounter_done = 0;
+				mg->mount_client_delay = 0;
+				notify_mount_client(mg);
+			}
+		}
+	}
+}
+
+/* Entry point for an incoming mount_status message.  It is ignored until
+   our own options message has been seen, saved for later while our
+   journals message is still outstanding, and processed right away
+   otherwise. */
+
+static void receive_mount_status(struct mountgroup *mg, char *buf, int len,
+				 int from)
+{
+	log_group(mg, "receive_mount_status from %d len %d last_cb %d",
+		  from, len, mg->last_callback);
+
+	if (!mg->got_our_options) {
+		log_group(mg, "ignore mount_status from %d", from);
+		return;
+	}
+
+	if (mg->got_our_journals) {
+		_receive_mount_status(mg, buf, len, from);
+		return;
+	}
+
+	save_message_old(mg, buf, len, from, MSG_MOUNT_STATUS);
+}
+
+/* We delay processing mount_status messages until we receive the journals
+   message for our own mount.  Our journals message is a snapshot of the memb
+   list at the time our options message is received on the remote node.  We
+   ignore any messages that would change the memb list prior to seeing our own
+   options message and we save any messages that would change the memb list
+   after seeing our own options message and before we receive the memb list
+   from the journals message. */
+
+/* Replay every MSG_MOUNT_STATUS message queued on mg->saved_messages
+   through _receive_mount_status(), then unlink and free it.  Saved
+   messages of other types stay queued. */
+
+static void process_saved_mount_status(struct mountgroup *mg)
+{
+	struct save_msg *cur, *next;
+
+	if (list_empty(&mg->saved_messages))
+		return;
+
+	log_group(mg, "process_saved_mount_status");
+
+	list_for_each_entry_safe(cur, next, &mg->saved_messages, list) {
+		if (cur->type == MSG_MOUNT_STATUS) {
+			_receive_mount_status(mg, cur->buf, cur->len,
+					      cur->nodeid);
+			list_del(&cur->list);
+			free(cur);
+		}
+	}
+}
+
+/* Entry point for an incoming recovery_status message: saved for later
+   when the last group callback was DO_STOP, processed right away when it
+   was DO_START, and only logged for any other callback state. */
+
+static void receive_recovery_status(struct mountgroup *mg, char *buf, int len,
+			     int from)
+{
+	if (mg->last_callback == DO_STOP) {
+		save_message_old(mg, buf, len, from, MSG_RECOVERY_STATUS);
+		return;
+	}
+
+	if (mg->last_callback == DO_START) {
+		_receive_recovery_status(mg, buf, len, from);
+		return;
+	}
+
+	log_group(mg, "receive_recovery_status %d last_callback %d",
+		  from, mg->last_callback);
+}
+
+/* tell others that all journals are recovered; they should clear
+   memb's from members_gone, clear needs_recovery and unblock locks */
+
+/* Send a header-only MSG_RECOVERY_DONE message from our node, with
+   to_nodeid 0, to the group.  Nothing is sent when the allocation fails. */
+
+static void send_recovery_done(struct mountgroup *mg)
+{
+	int len = sizeof(struct gdlm_header);
+	struct gdlm_header *hd;
+
+	/* zero-filled, so every header field not set below is 0 */
+	hd = calloc(1, len);
+	if (hd == NULL)
+		return;
+
+	hd->type = MSG_RECOVERY_DONE;
+	hd->nodeid = our_nodeid;
+	hd->to_nodeid = 0;
+
+	send_group_message_old(mg, len, (char *) hd);
+
+	free(hd);
+}
+
+/* Handle a recovery_done message: unlink and free every entry on
+   mg->members_gone, clear mg->needs_recovery and write 0 to the "block"
+   sysfs setting of the group. */
+
+static void receive_recovery_done(struct mountgroup *mg, char *buf, int len,
+				  int from)
+{
+	struct mg_member *gone, *next;
+
+	log_group(mg, "receive_recovery_done from %d needs_recovery %d",
+		  from, mg->needs_recovery);
+
+	list_for_each_entry_safe(gone, next, &mg->members_gone, list) {
+		log_group(mg, "receive_recovery_done clear jid %d nodeid %d",
+			  gone->jid, gone->nodeid);
+
+		list_del(&gone->list);
+		free(gone);
+	}
+
+	mg->needs_recovery = 0;
+
+	set_sysfs(mg, "block", 0);
+}
+
+/* Send a MSG_REMOUNT message whose MAX_OPTIONS_LEN-byte payload holds the
+   string "ro" when ro is non-zero and "rw" otherwise.  Nothing is sent
+   when the allocation fails. */
+
+static void send_remount(struct mountgroup *mg, int ro)
+{
+	struct gdlm_header *hd;
+	char *buf, *mode;
+	int len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN;
+
+	buf = calloc(1, len);
+	if (buf == NULL)
+		return;
+
+	hd = (struct gdlm_header *)buf;
+	hd->type = MSG_REMOUNT;
+	hd->nodeid = our_nodeid;
+	hd->to_nodeid = 0;
+
+	/* the option string lives directly behind the header */
+	mode = buf + sizeof(struct gdlm_header);
+	if (ro)
+		strcpy(mode, "ro");
+	else
+		strcpy(mode, "rw");
+
+	log_group(mg, "send_remount len %d \"%s\"", len, mode);
+
+	send_group_message_old(mg, len, buf);
+
+	free(buf);
+}
+
+/* Handle a remount message from node "from".  The option string after the
+   header selects read-write ("rw") or read-only ("ro"); the sender's
+   member entry and its MEMB_OPT_RW / MEMB_OPT_RO bits are updated to
+   match.  An option string with neither substring gives -EINVAL and
+   leaves the entry unchanged.  When the message is our own, the mount
+   group's rw/readonly fields are updated on success and the result is
+   reported through client_reply_remount(). */
+
+static void receive_remount(struct mountgroup *mg, char *buf, int len, int from)
+{
+	struct mg_member *memb;
+	char *options = buf + sizeof(struct gdlm_header);
+	int ro = 0;
+	int result = 0;
+
+	memb = find_memb_nodeid(mg, from);
+	if (!memb) {
+		log_error("receive_remount: unknown nodeid %d", from);
+		return;
+	}
+
+	/* "rw" is looked for first, so it wins when both substrings occur */
+	if (strstr(options, "rw"))
+		ro = 0;
+	else if (strstr(options, "ro"))
+		ro = 1;
+	else
+		result = -EINVAL;
+
+	/* FIXME: check if we've even fully completed our normal mount yet
+	   (received our own mount-status?)  if not, then disallow remount */
+
+	/* FIXME: going ro->rw may mean we can now do journal or first-mounter
+	   recovery that we couldn't do before. */
+
+	if (!result) {
+		memb->readonly = ro;
+		memb->rw = !ro;
+
+		if (ro)
+			memb->opts = (memb->opts & ~MEMB_OPT_RW) | MEMB_OPT_RO;
+		else
+			memb->opts = (memb->opts & ~MEMB_OPT_RO) | MEMB_OPT_RW;
+	}
+
+	if (from == our_nodeid) {
+		if (!result) {
+			mg->rw = memb->rw;
+			mg->readonly = memb->readonly;
+		}
+		client_reply_remount(mg, result);
+	}
+
+	log_group(mg, "receive_remount from %d rw=%d ro=%d opts=%x",
+		  from, memb->rw, memb->readonly, memb->opts);
+}
+
+/* Copy the mount mode of the mount group onto our own member entry.  The
+   modes are checked in the order readonly, spectator, rw and only the
+   first one that is set is applied, together with its MEMB_OPT_ bit.  Our
+   own entry must exist on mg->members. */
+
+static void set_our_memb_options(struct mountgroup *mg)
+{
+	struct mg_member *us = find_memb_nodeid(mg, our_nodeid);
+
+	ASSERT(us);
+
+	if (mg->readonly) {
+		us->readonly = 1;
+		us->opts |= MEMB_OPT_RO;
+		return;
+	}
+
+	if (mg->spectator) {
+		us->spectator = 1;
+		us->opts |= MEMB_OPT_SPECT;
+		return;
+	}
+
+	if (mg->rw) {
+		us->rw = 1;
+		us->opts |= MEMB_OPT_RW;
+	}
+}
+
+/* Send a MSG_OPTIONS message whose MAX_OPTIONS_LEN-byte payload holds a
+   copy of mg->mount_args.options.  At most MAX_OPTIONS_LEN-1 bytes are
+   copied into the zero-filled payload, so it stays NUL-terminated.
+   Nothing is sent when the allocation fails. */
+
+static void send_options(struct mountgroup *mg)
+{
+	struct gdlm_header *hd;
+	char *buf, *opts;
+	int len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN;
+
+	buf = calloc(1, len);
+	if (buf == NULL)
+		return;
+
+	hd = (struct gdlm_header *)buf;
+	hd->type = MSG_OPTIONS;
+	hd->nodeid = our_nodeid;
+	hd->to_nodeid = 0;
+
+	/* the option string lives directly behind the header */
+	opts = buf + sizeof(struct gdlm_header);
+	strncpy(opts, mg->mount_args.options, MAX_OPTIONS_LEN-1);
+
+	log_group(mg, "send_options len %d \"%s\"", len, opts);
+
+	send_group_message_old(mg, len, buf);
+
+	free(buf);
+}
+
+/* We set the new member's jid to the lowest unused jid.  If we're the lowest
+   existing member (by nodeid), then send jid info to the new node. */
+
+/* Look at rw/ro/spectator status of all existing mounters and whether
+   we need to do recovery.  Based on that, decide if the current mount
+   mode (ro/spectator) is permitted; if not, set jid = -2.  If spectator
+   mount and it's ok, set jid = -1.  If ro or rw mount and it's ok, set
+   real jid. */
+
+/* Choose a jid for the joining member "new" and decide whether it gets the
+   MEMB_OPT_RECOVER flag.  When our node is mg->master_nodeid, plocks are
+   stored and the journals message is sent to the new node afterwards.
+   Always returns 0. */
+
+static int assign_journal(struct mountgroup *mg, struct mg_member *new)
+{
+	struct mg_member *memb, *memb_recover = NULL, *memb_mounted = NULL;
+	int i, total, rw_count, ro_count, spect_count, invalid_count;
+
+	total = rw_count = ro_count = spect_count = invalid_count = 0;
+
+	/* survey all other members; the counts are only used for logging,
+	   memb_recover and memb_mounted drive the decisions below */
+	list_for_each_entry(memb, &mg->members, list) {
+		if (memb->nodeid == new->nodeid)
+			continue;
+		total++;
+		if (memb->jid == -2)
+			invalid_count++;
+		else if (memb->spectator)
+			spect_count++;
+		else if (memb->rw)
+			rw_count++;
+		else if (memb->readonly)
+			ro_count++;
+
+		if (memb->opts & MEMB_OPT_RECOVER) {
+			memb_recover = memb;
+			log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+				  memb->nodeid);
+		}
+
+		if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
+			memb_mounted = memb;
+	}
+
+	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+		  "needs_recovery %d", total, invalid_count, rw_count,
+		  ro_count, spect_count, mg->needs_recovery);
+
+	if (new->spectator) {
+		log_group(mg, "assign_journal: new spectator allowed");
+		new->jid = -1;
+		goto out;
+	}
+
+	/* lowest jid in 0..1023 that no member holds; when all of them are
+	   taken new->jid keeps its previous value */
+	for (i = 0; i < 1024; i++) {
+		memb = find_memb_jid(mg, i);
+		if (!memb) {
+			new->jid = i;
+			break;
+		}
+	}
+
+	/* Repeat first-mounter recovery: the fs has been mounted and in-use,
+	   but nodes have failed and none of the current mounters has been able
+	   to do recovery (all remaining nodes may be ro/spect for example).
+	   This puts us into the special "needs_recovery" state where new
+	   mounters are asked to do first-mounter recovery of the fs while
+	   the current mounters sit in a blocked state. */
+
+	if (mg->needs_recovery) {
+		if (!memb_recover) {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d gets OPT_RECOVER",
+				  new->nodeid);
+			new->opts |= MEMB_OPT_RECOVER;
+		} else {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d memb %d has OPT_RECOVER",
+				  new->nodeid, memb_recover->nodeid);
+		}
+		goto out;
+	}
+
+	/* Initial first-mounter recovery: the fs is coming online, the first
+	   mg member assumes first-mounter role and other nodes join the mg
+	   while the first-mounter is working.  These non-first mounters wait
+	   for the first-mounter to finish before notifying mount.gfs.  If the
+	   first-mounter fails, one of them will become the first-mounter. */
+
+	/* it shouldn't be possible to have someone doing first mounter
+	   recovery and also have someone with the fs fully mounted */
+
+	if (memb_mounted && memb_recover) {
+		log_group(mg, "memb_mounted %d memb_recover %d",
+			  memb_mounted->nodeid, memb_recover->nodeid);
+		ASSERT(0);
+	}
+
+	/* someone has successfully mounted the fs which means the fs doesn't
+	   need first mounter recovery */
+
+	if (memb_mounted) {
+		log_group(mg, "assign_journal: no first recovery needed %d",
+			  memb_mounted->nodeid);
+		goto out;
+	}
+
+	/* someone is currently doing first mounter recovery, they'll send
+	   mount_status when they're done letting everyone know the result */
+
+	if (memb_recover) {
+		log_group(mg, "assign_journal: %d doing first recovery",
+			  memb_recover->nodeid);
+		goto out;
+	}
+
+	/* when we received our journals, no one was flagged with OPT_RECOVER
+	   which means no first mounter recovery is needed or is current */
+
+	if (mg->global_first_recover_done) {
+		log_group(mg, "assign_journal: global_first_recover_done");
+		goto out;
+	}
+
+	/* no one has done kernel mount successfully and no one is doing first
+	   mounter recovery, the new node gets to try first mounter recovery */
+
+	log_group(mg, "kernel_mount_done %d kernel_mount_error %d "
+		      "first_mounter %d first_mounter_done %d",
+		      mg->kernel_mount_done, mg->kernel_mount_error,
+		      mg->first_mounter, mg->first_mounter_done);
+
+	log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+		  "fs not mounted", new->nodeid);
+	new->opts |= MEMB_OPT_RECOVER;
+
+ out:
+	log_group(mg, "assign_journal: new member %d got jid %d opts %x",
+		  new->nodeid, new->jid, new->opts);
+
+	/* only the master node passes plock state and the member list on to
+	   the new node */
+	if (mg->master_nodeid == our_nodeid) {
+		store_plocks(mg, new->nodeid);
+		send_journals(mg, new->nodeid);
+	}
+	return 0;
+}
+
+/* Apply an options message from node "from": derive the sender's mount
+   mode from the option string after the header and run assign_journal()
+   for it.  The substrings are tested in the order "spectator", "rw",
+   "ro" and only the first match is applied.  A message from a node that
+   is not on mg->members is logged as an error and dropped. */
+
+static void _receive_options(struct mountgroup *mg, char *buf, int len,
+			     int from)
+{
+	struct mg_member *sender;
+	char *options = buf + sizeof(struct gdlm_header);
+
+	sender = find_memb_nodeid(mg, from);
+	if (sender == NULL) {
+		log_error("unknown nodeid %d for options message", from);
+		return;
+	}
+
+	if (strstr(options, "spectator")) {
+		sender->spectator = 1;
+		sender->opts |= MEMB_OPT_SPECT;
+	} else if (strstr(options, "rw")) {
+		sender->rw = 1;
+		sender->opts |= MEMB_OPT_RW;
+	} else if (strstr(options, "ro")) {
+		sender->readonly = 1;
+		sender->opts |= MEMB_OPT_RO;
+	}
+
+	log_group(mg, "_receive_options from %d rw=%d ro=%d spect=%d opts=%x",
+		  from, sender->rw, sender->readonly, sender->spectator,
+		  sender->opts);
+
+	assign_journal(mg, sender);
+}
+
+/* Entry point for an incoming options message.  Our own message only sets
+   got_our_options and save_plocks.  Messages from other nodes are ignored
+   until our own has been seen, saved while the sender is not yet a member
+   or our journals message is still outstanding, and processed right away
+   otherwise. */
+
+static void receive_options(struct mountgroup *mg, char *buf, int len, int from)
+{
+	struct gdlm_header *hd = (struct gdlm_header *)buf;
+	int ready;
+
+	log_group(mg, "receive_options from %d len %d last_cb %d",
+		  from, len, mg->last_callback);
+
+	if (hd->nodeid == our_nodeid) {
+		mg->got_our_options = 1;
+		mg->save_plocks = 1;
+		return;
+	}
+
+	if (!mg->got_our_options) {
+		log_group(mg, "ignore options from %d", from);
+		return;
+	}
+
+	/* we can receive an options message before getting the start
+	   that adds the mounting node that sent the options, or
+	   we can receive options messages before we get the journals
+	   message for our own mount */
+
+	ready = find_memb_nodeid(mg, from) && mg->got_our_journals;
+
+	if (ready)
+		_receive_options(mg, buf, len, from);
+	else
+		save_message_old(mg, buf, len, from, MSG_OPTIONS);
+}
+
+/* Replay every saved MSG_OPTIONS message through _receive_options() and
+   drop it from mg->saved_messages.  Saved messages of other types are
+   left on the list. */
+
+static void process_saved_options(struct mountgroup *mg)
+{
+	struct save_msg *cur, *next;
+
+	if (list_empty(&mg->saved_messages))
+		return;
+
+	log_group(mg, "process_saved_options");
+
+	/* the _safe iterator is needed because entries are freed as we go */
+	list_for_each_entry_safe(cur, next, &mg->saved_messages, list) {
+		if (cur->type == MSG_OPTIONS) {
+			_receive_options(mg, cur->buf, cur->len, cur->nodeid);
+			list_del(&cur->list);
+			free(cur);
+		}
+	}
+}
+
+/* number of ints carried per member in a journals message:
+   nodeid, jid and opts, in that order */
+#define NUM 3
+
+/* send nodeid/jid/opts of every member to nodeid */
+
+/* Build and send a MSG_JOURNAL message addressed to nodeid.  The payload
+   after the gdlm_header is NUM little-endian ints (nodeid, jid, opts) for
+   each entry on mg->members.  If the buffer cannot be allocated nothing
+   is sent. */
+
+static void send_journals(struct mountgroup *mg, int nodeid)
+{
+	struct gdlm_header *hdr;
+	struct mg_member *m;
+	char *msg;
+	int *slot;
+	int msg_len, n = 0;
+
+	msg_len = sizeof(struct gdlm_header) +
+		  (mg->memb_count * NUM * sizeof(int));
+
+	/* zeroed allocation, so unset header fields go out as 0 */
+	msg = calloc(1, msg_len);
+	if (!msg)
+		return;
+
+	hdr = (struct gdlm_header *)msg;
+	hdr->type = MSG_JOURNAL;
+	hdr->nodeid = our_nodeid;
+	hdr->to_nodeid = nodeid;
+
+	slot = (int *) (msg + sizeof(struct gdlm_header));
+
+	list_for_each_entry(m, &mg->members, list) {
+		slot[n++] = cpu_to_le32(m->nodeid);
+		slot[n++] = cpu_to_le32(m->jid);
+		slot[n++] = cpu_to_le32(m->opts);
+	}
+
+	log_group(mg, "send_journals to %d len %d count %d",
+		  nodeid, msg_len, n);
+
+	send_group_message_old(mg, msg_len, msg);
+
+	free(msg);
+}
+
+/* Act on the jid we were assigned in the journals message and, unless
+   another node is still doing first mounter recovery, notify the mount
+   client. */
+
+static void received_our_jid(struct mountgroup *mg)
+{
+	log_group(mg, "received_our_jid %d", mg->our_jid);
+
+	/* A jid of -2 means we are not permitted to mount the fs; probably
+	   because we're trying to mount readonly but the next mounter is
+	   required to be rw.  The client is told with -EUCLEAN. */
+
+	if (mg->our_jid == -2) {
+		mg->mount_client_result = -EUCLEAN;
+		notify_mount_client(mg);
+		return;
+	}
+
+	/* The fs needs recovery and the existing mounters can't do it
+	   (they're spectator/readonly, or the first mounter's mount(2)
+	   failed), so we have been told to do first-mounter recovery. */
+
+	if (local_first_mounter_recovery(mg)) {
+		log_group(mg, "we're told to do first mounter recovery");
+		mg->first_mounter = 1;
+		mg->first_mounter_done = 0;
+		mg->mount_client_delay = 0;
+		mg->save_plocks = 0;
+		notify_mount_client(mg);
+		return;
+	}
+
+	/* Another node is the first mounter: hold off notifying the mount
+	   client until we get a successful mount status from it. */
+
+	if (remote_first_mounter_recovery(mg)) {
+		log_group(mg, "other node doing first mounter recovery, "
+			  "set mount_client_delay");
+		mg->mount_client_delay = 1;
+		mg->save_plocks = 0;
+		return;
+	}
+
+	/* ordinary mount: load plock state, then replay what was saved */
+	retrieve_plocks(mg);
+	mg->save_plocks = 0;
+	process_saved_plocks(mg);
+
+	notify_mount_client(mg);
+}
+
+/* Apply the journals message addressed to us.
+
+   The payload after the gdlm_header is a sequence of NUM-int records
+   (nodeid, jid, opts), little-endian.  Each record updates the matching
+   member; our own record also sets mg->our_jid.  Afterwards the saved
+   mount status and options messages are replayed and received_our_jid()
+   is called. */
+
+static void _receive_journals(struct mountgroup *mg, char *buf, int len,
+			      int from)
+{
+	struct mg_member *by_node, *by_jid;
+	int *rec;
+	int nrecs, n, nodeid, jid, opts;
+	int recover_seen = 0;
+
+	nrecs = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
+	rec = (int *) (buf + sizeof(struct gdlm_header));
+
+	for (n = 0; n < nrecs; n++, rec += NUM) {
+		nodeid = le32_to_cpu(rec[0]);
+		jid    = le32_to_cpu(rec[1]);
+		opts   = le32_to_cpu(rec[2]);
+
+		log_debug("receive nodeid %d jid %d opts %x",
+			  nodeid, jid, opts);
+
+		by_node = find_memb_nodeid(mg, nodeid);
+		by_jid = find_memb_jid(mg, jid);
+
+		/* complain if the node is unknown or some member already
+		   holds this jid; only the former makes us skip the record */
+		if (by_jid || !by_node)
+			log_error("invalid journals message "
+				  "nodeid %d jid %d opts %x",
+				  nodeid, jid, opts);
+		if (!by_node)
+			continue;
+
+		by_node->jid = jid;
+
+		if (nodeid != our_nodeid) {
+			by_node->opts = opts;
+			if (opts & MEMB_OPT_RO)
+				by_node->readonly = 1;
+			else if (opts & MEMB_OPT_RW)
+				by_node->rw = 1;
+			else if (opts & MEMB_OPT_SPECT)
+				by_node->spectator = 1;
+		} else {
+			mg->our_jid = jid;
+			/* set_our_memb_options() sets rest */
+			if (opts & MEMB_OPT_RECOVER)
+				by_node->opts |= MEMB_OPT_RECOVER;
+		}
+
+		if (opts & MEMB_OPT_RECOVER)
+			recover_seen = 1;
+	}
+
+	/* FIXME: use global_first_recover_done more widely instead of
+	   as a single special case */
+	if (!recover_seen)
+		mg->global_first_recover_done = 1;
+
+	process_saved_mount_status(mg);
+
+	/* options messages from new mounters were held back until we had
+	   the journals message for our own mount; handle them now */
+
+	process_saved_options(mg);
+
+	received_our_jid(mg);
+}
+
+/* Entry point for an incoming journals message.
+
+   Whoever the message is addressed to no longer needs journals from us.
+   The message body is only processed when it is addressed to us (or to
+   nodeid 0) and we have not already processed one. */
+
+static void receive_journals(struct mountgroup *mg, char *buf, int len,
+			     int from)
+{
+	struct gdlm_header *hdr = (struct gdlm_header *)buf;
+	struct mg_member *target;
+	int nrecs = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
+
+	log_group(mg, "receive_journals from %d to %d len %d count %d cb %d",
+		  from, hdr->to_nodeid, len, nrecs, mg->last_callback);
+
+	/* As with options messages from a newly added node, the journals
+	   message sent to that node can reach us before the start that
+	   adds it. */
+
+	target = find_memb_nodeid(mg, hdr->to_nodeid);
+	if (!target) {
+		log_group(mg, "receive_journals from %d to unknown %d",
+			  from, hdr->to_nodeid);
+		return;
+	}
+	target->needs_journals = 0;
+
+	/* a to_nodeid of 0 is not filtered out here */
+	if (hdr->to_nodeid != 0 && hdr->to_nodeid != our_nodeid)
+		return;
+
+	if (mg->got_our_journals) {
+		log_group(mg, "receive_journals from %d duplicate", from);
+		return;
+	}
+
+	mg->got_our_journals = 1;
+	_receive_journals(mg, buf, len, from);
+}
+
+/* Link new into mg->members in front of the first existing member with a
+   larger nodeid, or at the tail when there is no such member. */
+
+static void add_ordered_member(struct mountgroup *mg, struct mg_member *new)
+{
+	struct list_head *pos;
+	struct mg_member *cur;
+
+	/* if no larger nodeid is found (or the list is empty) the loop
+	   ends with pos back at the list head */
+	list_for_each(pos, &mg->members) {
+		cur = list_entry(pos, struct mg_member, list);
+		if (new->nodeid < cur->nodeid)
+			break;
+	}
+
+	/* inserting just before pos covers both cases: in front of the
+	   larger member, or (pos == head) at the tail of the list */
+	list_add_tail(&new->list, pos);
+}
+
+/* Allocate a zeroed member for nodeid with jid JID_INIT, insert it into
+   mg->members and bump memb_count.  Unless mg->init is set, the member
+   is flagged as needing a journals message.
+   Returns 0, or -ENOMEM if the allocation fails. */
+
+static int add_member(struct mountgroup *mg, int nodeid)
+{
+	struct mg_member *m = calloc(1, sizeof(struct mg_member));
+
+	if (!m)
+		return -ENOMEM;
+
+	m->nodeid = nodeid;
+	m->jid = JID_INIT;
+	if (!mg->init)
+		m->needs_journals = 1;
+
+	add_ordered_member(mg, m);
+	mg->memb_count++;
+
+	return 0;
+}
+
+/* Return 1 if nodeid is on mg->members, otherwise 0. */
+
+static int is_member(struct mountgroup *mg, int nodeid)
+{
+	struct mg_member *m;
+	int present = 0;
+
+	list_for_each_entry(m, &mg->members, list) {
+		if (m->nodeid == nodeid) {
+			present = 1;
+			break;
+		}
+	}
+	return present;
+}
+
+/* Return 1 if nodeid is on mg->members_gone, otherwise 0. */
+
+static int is_removed(struct mountgroup *mg, int nodeid)
+{
+	struct mg_member *m;
+	int gone = 0;
+
+	list_for_each_entry(m, &mg->members_gone, list) {
+		if (m->nodeid == nodeid) {
+			gone = 1;
+			break;
+		}
+	}
+	return gone;
+}
+
+/* New mounters may be waiting for a journals message that a failed node (as
+   master) would have sent.  If the master failed and we're the new master,
+   then send a journals message to any nodes for whom we've not seen a journals
+   message.  We also need to checkpoint the plock state for the new nodes to
+   read after they get their journals message. */
+
+/* Send a journals message to every member still flagged needs_journals.
+   store_plocks() is called once, for the first such member, before its
+   message is sent. */
+
+static void resend_journals(struct mountgroup *mg)
+{
+	struct mg_member *m;
+	int ckpt_written = 0;
+
+	list_for_each_entry(m, &mg->members, list) {
+		if (m->needs_journals) {
+			if (!ckpt_written) {
+				store_plocks(mg, m->nodeid);
+				ckpt_written = 1;
+			}
+
+			log_group(mg, "resend_journals to %d", m->nodeid);
+			send_journals(mg, m->nodeid);
+		}
+	}
+}
+
+/* The master node is the member of the group with the lowest nodeid who
+   was also a member of the last "finished" group, i.e. a member of the
+   group the last time it got a finish callback.  The job of the master
+   is to send state info to new nodes joining the group, and doing that
+   requires that the master has all the state to send -- a new joining
+   node that has the lowest nodeid doesn't have any state, which is why
+   we add the "finished" requirement. */
+
+/* Recompute mg->low_nodeid (lowest nodeid of any member) and
+   mg->master_nodeid (lowest nodeid among members with finished set).
+   Either is -1 when no member qualifies. */
+
+static void update_master_nodeid(struct mountgroup *mg)
+{
+	struct mg_member *m;
+	int master = -1, lowest = -1;
+
+	list_for_each_entry(m, &mg->members, list) {
+		if (lowest == -1 || m->nodeid < lowest)
+			lowest = m->nodeid;
+
+		if (m->finished && (master == -1 || m->nodeid < master))
+			master = m->nodeid;
+	}
+
+	mg->low_nodeid = lowest;
+	mg->master_nodeid = master;
+}
+
+/* This can happen before we receive a journals message for our mount. */
+
+/* Bring mg->members in line with the node list of a start.
+
+   mg:        mountgroup being updated
+   num_nodes: number of entries in nodeids
+   nodeids:   nodeids that make up the group now
+   pos_out:   set to the number of members actually added
+   neg_out:   set to the number of members removed
+
+   Departed members are moved to mg->members_gone and flagged for journal
+   recovery where that applies, new members are added, the master is
+   recomputed, and first-mounter duty is taken over or reassigned when
+   the node that held it is gone. */
+
+static void recover_members(struct mountgroup *mg, int num_nodes,
+			    int *nodeids, int *pos_out, int *neg_out)
+{
+	struct mg_member *memb, *safe, *memb_gone_recover = NULL;
+	int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
+	int master_failed = 0;
+	int rv;
+
+	/* move departed nodes from members list to members_gone */
+
+	list_for_each_entry_safe(memb, safe, &mg->members, list) {
+		found = 0;
+		for (i = 0; i < num_nodes; i++) {
+			if (memb->nodeid == nodeids[i]) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (!found) {
+			neg++;
+
+			list_move(&memb->list, &mg->members_gone);
+			memb->gone_event = mg->start_event_nr;
+			memb->gone_type = mg->start_type;
+			mg->memb_count--;
+
+			memb->tell_gfs_to_recover = 0;
+			memb->recovery_status = 0;
+			memb->local_recovery_status = 0;
+
+			/* - journal cb for failed or withdrawing nodes
+			   - failed node was assigned a journal
+			   - no journal cb if failed node was spectator
+			   - no journal cb if we've already done a journal cb */
+
+			if ((memb->gone_type == GROUP_NODE_FAILED ||
+			    memb->withdrawing) &&
+			    memb->jid != JID_INIT &&
+			    memb->jid != -2 &&
+			    !memb->spectator &&
+			    !memb->wait_gfs_recover_done) {
+				memb->tell_gfs_to_recover = 1;
+				memb->recovery_status = RS_NEED_RECOVERY;
+				memb->local_recovery_status = RS_NEED_RECOVERY;
+			}
+
+			log_group(mg, "remove member %d tell_gfs_to_recover %d "
+				  "(%d,%d,%d,%d,%d,%d)",
+				  memb->nodeid, memb->tell_gfs_to_recover,
+				  mg->spectator,
+				  mg->start_type,
+				  memb->withdrawing,
+				  memb->jid,
+				  memb->spectator,
+				  memb->wait_gfs_recover_done);
+
+			if (mg->master_nodeid == memb->nodeid &&
+			    memb->gone_type == GROUP_NODE_FAILED)
+				master_failed = 1;
+
+			if (memb->opts & MEMB_OPT_RECOVER)
+				memb_gone_recover = memb;
+		}
+	}
+
+	/* add new nodes to members list */
+
+	for (i = 0; i < num_nodes; i++) {
+		id = nodeids[i];
+		if (is_member(mg, id))
+			continue;
+
+		/* only count and report a member that was really added */
+		rv = add_member(mg, id);
+		if (rv < 0) {
+			log_error("add member %d failed %d", id, rv);
+			continue;
+		}
+		pos++;
+		log_group(mg, "add member %d", id);
+	}
+
+	prev_master_nodeid = mg->master_nodeid;
+	update_master_nodeid(mg);
+
+	*pos_out = pos;
+	*neg_out = neg;
+
+	log_group(mg, "total members %d master_nodeid %d prev %d",
+		  mg->memb_count, mg->master_nodeid, prev_master_nodeid);
+
+
+	/* The master failed and we're the new master, we need to:
+
+	   - unlink the ckpt that the failed master had open so new ckpts
+	     can be created down the road
+	   - resend journals msg to any nodes that needed one from the
+	     failed master
+	   - store plocks in ckpt for the new mounters to read when they
+	     get the journals msg from us */
+
+	if (neg && master_failed &&
+	    (prev_master_nodeid != -1) &&
+	    (prev_master_nodeid != mg->master_nodeid) &&
+	    (our_nodeid == mg->master_nodeid)) {
+		log_group(mg, "unlink ckpt for failed master %d",
+			  prev_master_nodeid);
+		unlink_checkpoint(mg);
+		resend_journals(mg);
+	}
+
+	/* Do we need a new first mounter?
+
+	   If we've not gotten a journals message yet (implies we're mounting)
+	   and there's only one node left in the group (us, after removing the
+	   failed node), then it's possible that the failed node was doing
+	   first mounter recovery, so we need to become first mounter.
+
+	   If we've received a journals message, we can check if the failed
+	   node was doing first mounter recovery (MEMB_OPT_RECOVER set) and
+	   if so select the next first mounter. */
+
+	if (!neg)
+		return;
+
+	if (!mg->got_our_journals && mg->memb_count == 1) {
+		log_group(mg, "we are left alone, act as first mounter");
+		unlink_checkpoint(mg);
+		memb = find_memb_nodeid(mg, our_nodeid);
+		if (!memb) {
+			/* the one remaining member is not us */
+			log_error("recover_members: our nodeid %d not found",
+				  our_nodeid);
+			return;
+		}
+		memb->jid = 0;
+		memb->opts |= MEMB_OPT_RECOVER;
+		mg->our_jid = 0;
+		mg->first_mounter = 1;
+		mg->first_mounter_done = 0;
+		mg->got_our_options = 1;
+		mg->got_our_journals = 1;
+		mg->mount_client_delay = 0;
+		notify_mount_client(mg);
+		return;
+	}
+
+	if (memb_gone_recover) {
+		log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
+			  memb_gone_recover->nodeid);
+		memb_gone_recover->tell_gfs_to_recover = 0;
+	}
+
+	if (memb_gone_recover && mg->got_our_journals) {
+		assign_next_first_mounter(mg);
+		memb = find_memb_nodeid(mg, our_nodeid);
+		if (!memb) {
+			log_error("recover_members: our nodeid %d not found",
+				  our_nodeid);
+			return;
+		}
+		/* MEMB_OPT_RECOVER on our own member means we were picked
+		   as the next first mounter */
+		if (memb->opts & MEMB_OPT_RECOVER) {
+			log_group(mg, "first mounter failed, we get "
+				  "MEMB_OPT_RECOVER");
+			unlink_checkpoint(mg);
+			mg->first_mounter = 1;
+			mg->first_mounter_done = 0;
+			mg->mount_client_delay = 0;
+			notify_mount_client(mg);
+		}
+	}
+}
+
+/* Validate a mount request from client ci and, if it is acceptable,
+   create the mountgroup and join the group named after the fs.
+
+   ma->table must be "<cluster>:<name>", the lock protocol must be
+   lock_dlm, and the options must not contain the reserved jid=, first=
+   or id= settings.
+
+   Returns 0 once the join has been started.  On any failure the negative
+   error is sent back through client_reply_join() and also returned. */
+
+int join_mountgroup_old(int ci, struct gfsc_mount_args *ma)
+{
+	struct mountgroup *mg = NULL;
+	char table2[PATH_MAX];
+	char *cluster = NULL, *name = NULL;
+	int rv;
+
+	log_debug("join: %s %s %s %s %s %s",
+		  ma->dir, ma->type, ma->proto, ma->table,
+		  ma->options, ma->dev);
+
+	if (strcmp(ma->proto, "lock_dlm")) {
+		log_error("join: lockproto %s not supported", ma->proto);
+		rv = -EPROTONOSUPPORT;
+		goto fail;
+	}
+
+	if (strstr(ma->options, "jid=") ||
+	    strstr(ma->options, "first=") ||
+	    strstr(ma->options, "id=")) {
+		log_error("join: jid, first and id are reserved options");
+		rv = -EOPNOTSUPP;
+		goto fail;
+	}
+
+	/* table is <cluster>:<name> */
+
+	/* copy at most sizeof - 1 bytes so the zero fill always leaves a
+	   terminating NUL; strncpy does not add one when the source fills
+	   the whole count */
+	memset(table2, 0, sizeof(table2));
+	strncpy(table2, ma->table, sizeof(table2) - 1);
+
+	name = strstr(table2, ":");
+	if (!name) {
+		rv = -EBADFD;
+		goto fail;
+	}
+
+	/* split in place: table2 becomes the cluster name, name follows */
+	*name = '\0';
+	name++;
+	cluster = table2;
+
+	if (strlen(name) > GFS_MOUNTGROUP_LEN) {
+		rv = -ENAMETOOLONG;
+		goto fail;
+	}
+
+	mg = find_mg(name);
+	if (mg) {
+		if (strcmp(mg->mount_args.dev, ma->dev)) {
+			log_error("different fs dev %s with same name",
+				  mg->mount_args.dev);
+			rv = -EADDRINUSE;
+		} else if (mg->reject_mounts) {
+			/* fs is being unmounted */
+			log_error("join: reject mount due to unmount");
+			rv = -ESTALE;
+		} else if (mg->mount_client || !mg->kernel_mount_done) {
+			log_error("join: other mount in progress %d %d",
+				  mg->mount_client, mg->kernel_mount_done);
+			rv = -EBUSY;
+		} else {
+			log_group(mg, "join: already mounted");
+			rv = -EALREADY;
+		}
+		/* the existing mountgroup is not ours to free */
+		goto fail;
+	}
+
+	mg = create_mg(name);
+	if (!mg) {
+		rv = -ENOMEM;
+		goto fail;
+	}
+	mg->mount_client = ci;
+	memcpy(&mg->mount_args, ma, sizeof(struct gfsc_mount_args));
+
+	if (strlen(cluster) != strlen(clustername) ||
+	    strlen(cluster) == 0 || strcmp(cluster, clustername)) {
+		log_error("join: fs requires cluster=\"%s\" current=\"%s\"",
+			  cluster, clustername);
+		rv = -EBADR;
+		goto fail_free;
+	}
+	log_group(mg, "join: cluster name matches: %s", clustername);
+
+	if (strstr(ma->options, "spectator")) {
+		log_group(mg, "join: spectator mount");
+		mg->spectator = 1;
+	} else {
+		if (!we_are_in_fence_domain()) {
+			log_error("join: not in default fence domain");
+			rv = -ENOANO;
+			goto fail_free;
+		}
+	}
+
+	if (!mg->spectator && strstr(ma->options, "rw"))
+		mg->rw = 1;
+	else if (strstr(ma->options, "ro")) {
+		if (mg->spectator) {
+			log_error("join: readonly invalid with spectator");
+			rv = -EROFS;
+			goto fail_free;
+		}
+		mg->readonly = 1;
+	}
+
+	if (strlen(ma->options) > MAX_OPTIONS_LEN-1) {
+		rv = -EMLINK;
+		log_error("mount: options too long %zu", strlen(ma->options));
+		goto fail_free;
+	}
+
+	list_add(&mg->list, &mountgroups);
+	group_join(gh, name);
+	return 0;
+
+ fail_free:
+	free(mg);
+ fail:
+	client_reply_join(ci, ma, rv);
+	return rv;
+}
+
+/* recover_members() discovers which nodes need journal recovery
+   and moves the memb structs for those nodes into members_gone
+   and sets memb->tell_gfs_to_recover on them */
+
+/* we don't want to tell gfs-kernel to do journal recovery for a failed
+   node in a number of cases:
+   - we're a spectator or readonly mount
+   - gfs-kernel is currently withdrawing
+   - we're mounting and haven't received a journals message yet
+   - we're mounting and got a kernel mount error back from mount.gfs
+   - we're mounting and haven't notified mount.gfs yet (to do mount(2))
+   - we're mounting and got_kernel_mount is 0, i.e. we've not seen a uevent
+     related to the kernel mount yet
+   (some of the mounting checks should be obviated by others)
+
+   the problem we're trying to avoid here is telling gfs-kernel to do
+   recovery when it can't for some reason and then waiting forever for
+   a recovery_done signal that will never arrive. */
+
+/* Drive journal recovery for the members on mg->members_gone.
+
+   Each call does one of the following and then returns:
+   - if we are not in a position to recover, clears tell_gfs_to_recover
+     on every departed member and calls start_done()
+   - if a recovery is still outstanding, does nothing
+   - hands the next journal to gfs-kernel through the "recover" sysfs file
+   - when nothing is left to hand over, either sends our recovery status
+     (if any journal was recovered successfully) or calls start_done() */
+
+static void recover_journals(struct mountgroup *mg)
+{
+	struct mg_member *memb;
+	int rv;
+
+	if (mg->spectator ||
+	    mg->readonly ||
+	    mg->withdraw ||
+	    mg->our_jid == JID_INIT ||
+	    mg->kernel_mount_error ||
+	    !mg->mount_client_notified ||
+	    !mg->got_kernel_mount ||
+	    !mg->kernel_mount_done) {
+		log_group(mg, "recover_journals: unable %d,%d,%d,%d,%d,%d,%d,%d",
+			  mg->spectator,
+			  mg->readonly,
+			  mg->withdraw,
+			  mg->our_jid,
+			  mg->kernel_mount_error,
+			  mg->mount_client_notified,
+			  mg->got_kernel_mount,
+			  mg->kernel_mount_done);
+
+		list_for_each_entry(memb, &mg->members_gone, list) {
+			log_group(mg, "member gone %d jid %d "
+				  "tell_gfs_to_recover %d",
+				  memb->nodeid, memb->jid,
+				  memb->tell_gfs_to_recover);
+
+			/* record locally that this journal was not attempted */
+			if (memb->tell_gfs_to_recover) {
+				memb->tell_gfs_to_recover = 0;
+				memb->local_recovery_status = RS_READONLY;
+			}
+		}
+		start_done(mg);
+		return;
+	}
+
+	/* we feed one jid into the kernel for recovery instead of all
+	   at once because we need to get the result of each independently
+	   through the single recovery_done sysfs file */
+
+	list_for_each_entry(memb, &mg->members_gone, list) {
+		if (memb->wait_gfs_recover_done) {
+			log_group(mg, "delay new gfs recovery, "
+				  "wait_gfs_recover_done for nodeid %d jid %d",
+				  memb->nodeid, memb->jid);
+			return;
+		}
+	}
+
+	list_for_each_entry(memb, &mg->members_gone, list) {
+		if (!memb->tell_gfs_to_recover)
+			continue;
+
+		log_group(mg, "recover journal %d nodeid %d",
+			  memb->jid, memb->nodeid);
+
+		rv = set_sysfs(mg, "recover", memb->jid);
+		if (rv < 0) {
+			/* could not hand this one over, try the next member */
+			memb->local_recovery_status = RS_NOFS;
+			continue;
+		}
+		memb->tell_gfs_to_recover = 0;
+		memb->wait_gfs_recover_done = 1;
+		/* one journal at a time: stop after the first one handed over */
+		return;
+	}
+
+	/* no more journals to attempt to recover, if we've been successful
+	   recovering any then send out status, if not then start_done...
+	   receiving no status message from us before start_done means we
+	   didn't successfully recover any journals.  If we send out status,
+	   then delay start_done until we get our own message (so all nodes
+	   will get the status before finish) */
+
+	list_for_each_entry(memb, &mg->members_gone, list) {
+		if (memb->local_recovery_status == RS_SUCCESS) {
+			send_recovery_status(mg);
+			log_group(mg, "delay start_done until status recvd");
+			return;
+		}
+	}
+
+	start_done(mg);
+}
+
+/* In some cases, we may be joining a mountgroup with needs_recovery
+   set (there are journals that need recovery and current members can't
+   recover them because they're ro).  In this case, we're told to act
+   like the first mounter to cause gfs to try to recover all journals
+   when it mounts.  When gfs does this, we'll get recovery_done's for
+   the individual journals it recovers (ignored) and finally, if all
+   journals are ok, an others_may_mount/first_done. */
+
+/* When gfs does first-mount recovery, the mount(2) fails if it can't
+   recover one of the journals.  If we get o_m_m, then we know it was
+   able to successfully recover all the journals. */
+
+/* When we're the first mounter, gfs does recovery on all the journals
+   and does "recovery_done" callbacks when it finishes each.  We ignore
+   these and wait for gfs to be finished with all at which point it calls
+   others_may_mount() and first_done is set. */
+
+/* recovery_done handling while we are the first mounter: read the
+   "first_done" sysfs file and, once it is set, mark first_mounter_done
+   and send the recovery-done message.
+   Returns 0, or the negative result of the sysfs read. */
+
+static int kernel_recovery_done_first(struct mountgroup *mg)
+{
+	int first_done;
+	int rv = read_sysfs_int(mg, "first_done", &first_done);
+
+	if (rv < 0)
+		return rv;
+
+	log_group(mg, "kernel_recovery_done_first first_done %d", first_done);
+
+	if (mg->kernel_mount_done)
+		log_group(mg, "FIXME: assuming kernel_mount_done comes after "
+			  "first_done");
+
+	if (!first_done)
+		return 0;
+
+	mg->first_mounter_done = 1;
+	send_recovery_done(mg);
+	return 0;
+}
+
+/* Return 1 if any departed member is still waiting for gfs-kernel to
+   report recovery_done, otherwise 0. */
+
+static int need_kernel_recovery_done(struct mountgroup *mg)
+{
+	struct mg_member *m;
+	int pending = 0;
+
+	list_for_each_entry(m, &mg->members_gone, list) {
+		if (m->wait_gfs_recover_done) {
+			pending = 1;
+			break;
+		}
+	}
+	return pending;
+}
+
+/* Note: when a readonly node fails we do consider its journal (and the
+   fs) to need recovery... not sure this is really necessary, but
+   the readonly node did "own" a journal so it seems proper to recover
+   it even if the node wasn't writing to it.  So, if there are 3 ro
+   nodes mounting the fs and one fails, gfs on the remaining 2 will
+   remain blocked until an rw node mounts, and the next mounter must
+   be rw. */
+
+/* Handle a recovery_done notification from gfs-kernel for the fs named
+   in table ("<cluster>:<name>").
+
+   While we are an unfinished first mounter this is delegated to
+   kernel_recovery_done_first().  Otherwise the jid read from the
+   "recover_done" sysfs file is matched against departed members waiting
+   on gfs recovery, the result read from "recover_status" is recorded in
+   local_recovery_status, and recover_journals() is called to move on.
+
+   Returns 0 when the notification was handled or ignored, -1 if table
+   has no ':' or names an unknown mountgroup, or a negative error from
+   reading sysfs. */
+
+int kernel_recovery_done_old(char *table)
+{
+	struct mountgroup *mg;
+	struct mg_member *memb;
+	char *name;
+	char *ss;
+	int rv, jid_done, status, found = 0;
+
+	/* strstr() returns NULL when there is no ':'; don't step past it */
+	name = strstr(table, ":");
+	if (!name) {
+		log_error("recovery_done: invalid table name %s", table);
+		return -1;
+	}
+	name++;
+
+	mg = find_mg(name);
+	if (!mg) {
+		log_error("recovery_done: unknown mount group %s", table);
+		return -1;
+	}
+
+	if (mg->first_mounter && !mg->first_mounter_done)
+		return kernel_recovery_done_first(mg);
+
+	rv = read_sysfs_int(mg, "recover_done", &jid_done);
+	if (rv < 0)
+		return rv;
+
+	list_for_each_entry(memb, &mg->members_gone, list) {
+		if (memb->jid == jid_done) {
+			if (memb->wait_gfs_recover_done) {
+				memb->wait_gfs_recover_done = 0;
+				found = 1;
+			}
+			break;
+		}
+	}
+
+	/* We need to ignore recovery_done callbacks in the case where there
+	   are a bunch of recovery_done callbacks for the first mounter, but
+	   we detect "first_done" before we've processed all the
+	   recovery_done's. */
+
+	if (!found) {
+		log_group(mg, "recovery_done jid %d ignored, first %d,%d",
+			  jid_done, mg->first_mounter, mg->first_mounter_done);
+		return 0;
+	}
+
+	/* from here on memb is the departed member whose jid was reported */
+
+	rv = read_sysfs_int(mg, "recover_status", &status);
+	if (rv < 0) {
+		log_group(mg, "recovery_done jid %d nodeid %d sysfs error %d",
+			  memb->jid, memb->nodeid, rv);
+		memb->local_recovery_status = RS_NOFS;
+		goto out;
+	}
+
+	switch (status) {
+	case LM_RD_GAVEUP:
+		/*
+		 * This is unfortunate; it's needed for bz 442451 where
+		 * gfs-kernel fails to acquire the journal lock on all nodes
+		 * because a withdrawing node has not yet called
+		 * dlm_release_lockspace() to free its journal lock.  With
+		 * this, all nodes should repeatedly try to recover the
+		 * journal of the withdrawn node until the withdrawing node
+		 * clears its dlm locks, and gfs on each of the remaining nodes
+		 * succeeds in doing the recovery.
+		 */
+
+		if (memb->withdrawing) {
+			log_group(mg, "recovery_done jid %d nodeid %d retry "
+				  "for withdraw", memb->jid, memb->nodeid);
+			/* re-arm so recover_journals() below tries again */
+			memb->tell_gfs_to_recover = 1;
+			memb->wait_gfs_recover_done = 0;
+			usleep(500000);
+		}
+
+		memb->local_recovery_status = RS_GAVEUP;
+		ss = "gaveup";
+		break;
+	case LM_RD_SUCCESS:
+		memb->local_recovery_status = RS_SUCCESS;
+		ss = "success";
+		break;
+	default:
+		log_error("recovery_done: jid %d nodeid %d unknown status %d",
+			  memb->jid, memb->nodeid, status);
+		ss = "unknown";
+	}
+
+	log_group(mg, "recovery_done jid %d nodeid %d %s",
+		  memb->jid, memb->nodeid, ss);
+
+	/* sanity check */
+	if (need_kernel_recovery_done(mg))
+		log_error("recovery_done: should be no pending gfs recoveries");
+
+ out:
+	recover_journals(mg);
+	return 0;
+}
+
+/* Handle a remount request from client ci for the fs in ma->table
+   ("<cluster>:<name>").  The request is treated as read-only when
+   ma->options begins with "ro" and as read-write otherwise.
+
+   Returns 1 if the requested mode is already in effect, 0 after the
+   remount message has been sent, and -1 if the table has no ':' or the
+   mountgroup is unknown. */
+
+int remount_mountgroup_old(int ci, struct gfsc_mount_args *ma)
+{
+	struct mountgroup *mg;
+	char *name;
+	int ro = 0, rw = 0;
+
+	/* strstr() returns NULL when there is no ':'; don't step past it */
+	name = strstr(ma->table, ":");
+	if (!name) {
+		log_error("remount: invalid table name %s", ma->table);
+		return -1;
+	}
+	name++;
+
+	log_debug("remount: %s ci %d", name, ci);
+
+	if (!strncmp(ma->options, "ro", 2))
+		ro = 1;
+	else
+		rw = 1;
+
+	mg = find_mg(name);
+	if (!mg) {
+		log_error("remount: %s not found", name);
+		return -1;
+	}
+
+	/* no change */
+	if ((mg->readonly && ro) || (mg->rw && rw))
+		return 1;
+
+	mg->remount_client = ci;
+	send_remount(mg, ro);
+	return 0;
+}
+
+/* Leave the mountgroup for the fs named in table ("<cluster>:<name>").
+   mnterr is non-zero when the leave follows a failed mount.
+
+   A withdrawn fs is simply dropped from withdrawn_mounts.  Otherwise the
+   group is left, unless the fs is withdrawing or still mounting.
+
+   Returns 0 on success, -1 if the table has no ':', the mountgroup is
+   unknown, or the leave is refused. */
+
+int leave_mountgroup_old(char *table, int mnterr)
+{
+	struct mountgroup *mg;
+	char *name;
+
+	/* strstr() returns NULL when there is no ':'; don't step past it */
+	name = strstr(table, ":");
+	if (!name) {
+		log_error("leave: invalid table name %s", table);
+		return -1;
+	}
+	name++;
+
+	log_debug("leave: %s mnterr %d", name, mnterr);
+
+	list_for_each_entry(mg, &withdrawn_mounts, list) {
+		if (strcmp(mg->name, name))
+			continue;
+
+		/* returning right after the free keeps the plain (non-safe)
+		   iterator from touching the freed entry */
+		log_group(mg, "leave: for withdrawn fs");
+		list_del(&mg->list);
+		free(mg);
+		return 0;
+	}
+
+	mg = find_mg(name);
+	if (!mg) {
+		log_error("leave: %s not found", name);
+		return -1;
+	}
+
+	if (mnterr) {
+		/* sanity check: we should already have gotten the error from
+		   the mount_result message sent by mount.gfs */
+		if (!mg->kernel_mount_error) {
+			log_group(mg, "leave: mount_error is new %d %d",
+				  mg->kernel_mount_error, mnterr);
+			mg->kernel_mount_error = mnterr;
+			mg->kernel_mount_done = 1;
+		}
+		goto out;
+	}
+
+	if (mg->withdraw) {
+		log_error("leave: %s is withdrawing", name);
+		return -1;
+	}
+
+	if (!mg->kernel_mount_done) {
+		log_error("leave: %s is still mounting", name);
+		return -1;
+	}
+
+	/* Check to see if we're waiting for a kernel recovery_done to do a
+	   start_done().  If so, call the start_done() here because we won't be
+	   getting anything else from gfs-kernel which is now gone. */
+
+	if (need_kernel_recovery_done(mg)) {
+		log_group(mg, "leave: fill in start_done");
+		start_done(mg);
+	}
+ out:
+	mg->reject_mounts = 1;
+	group_leave(gh, mg->name);
+	return 0;
+}
+
+/* Try to read the "id" sysfs file of the fs named in table
+   ("<cluster>:<name>") and log the result of the read.  Does nothing if
+   the table has no ':' or the mountgroup is unknown. */
+
+void ping_kernel_mount_old(char *table)
+{
+	struct mountgroup *mg;
+	char *name;
+	int rv, val;
+
+	/* strstr() returns NULL when there is no ':'; don't step past it */
+	name = strstr(table, ":");
+	if (!name)
+		return;
+	name++;
+
+	mg = find_mg(name);
+	if (!mg)
+		return;
+
+	rv = read_sysfs_int(mg, "id", &val);
+
+	log_group(mg, "ping_kernel_mount %d", rv);
+}
+
+/* Record the result of the kernel mount for the fs in ma->table
+   ("<cluster>:<name>"): clear the mount client, set kernel_mount_done,
+   store result as kernel_mount_error, and send our mount status.
+   A table without ':' or an unknown mountgroup is logged and ignored. */
+
+void mount_done_old(struct gfsc_mount_args *ma, int result)
+{
+	struct mountgroup *mg;
+	char *name;
+
+	/* strstr() returns NULL when there is no ':'; don't step past it */
+	name = strstr(ma->table, ":");
+	if (!name) {
+		log_error("mount_done: invalid table name %s", ma->table);
+		return;
+	}
+	name++;
+
+	mg = find_mg(name);
+	if (!mg) {
+		log_error("mount_done: %s not found", ma->table);
+		return;
+	}
+
+	log_group(mg, "mount_done: result %d first_mounter %d",
+		  result, mg->first_mounter);
+
+	mg->mount_client = 0;
+	mg->mount_client_fd = 0;
+
+	mg->kernel_mount_done = 1;
+	mg->kernel_mount_error = result;
+
+	send_mount_status(mg);
+}
+
+/* When mounting a fs, we first join the mountgroup, then tell mount.gfs
+   to proceed with the kernel mount.  Once we're in the mountgroup, we
+   can get a stop callback at any time, which requires us to block the
+   fs by setting a sysfs file.  If the kernel mount is slow, we can get
+   a stop callback and try to set the sysfs file before the kernel mount
+   has actually created the sysfs files for the fs.  This function delays
+   any further processing until the sysfs files exist. */
+
+/* This function returns 0 when the kernel mount is successfully detected
+   and we know that do_stop() will be able to block the fs.
+   This function returns a negative error if it detects the kernel mount
+   has failed which means there's nothing to stop and do_stop() can assume
+   an implicit stop. */
+
+/* wait for
+   - kernel mount to get to the point of creating sysfs files we
+     can read (and that do_stop can then use), or
+   - kernel mount to fail causing mount.gfs to send us a MOUNT_DONE
+     which we read in process_connection() */
+
+/* Poll until the kernel mount has either created its sysfs files or
+   failed.  Returns 0 once "id" can be read; otherwise returns the result
+   of the last failed read after mount.gfs has reported a mount error. */
+
+static int wait_for_kernel_mount(struct mountgroup *mg)
+{
+	int rv, val;
+
+	for (;;) {
+		/* The normal way out: the kernel mount has got far enough
+		   to create the sysfs files, which we detect by reading
+		   "id" successfully.  With the files in place do_stop()
+		   will be able to block the kernel. */
+
+		rv = read_sysfs_int(mg, "id", &val);
+		if (rv == 0)
+			break;
+
+		usleep(100000);
+
+		/* kernel_mount_done is set by mount_done_old(), which
+		   process_connection() calls if mount.gfs sends
+		   MOUNT_DONE. */
+
+		if (mg->kernel_mount_done) {
+			/* mount(2) failed, stop becomes implicit */
+			if (mg->kernel_mount_error)
+				break;
+
+			/* mount(2) succeeded, so "id" should become
+			   readable very shortly; just poll again */
+			continue;
+		}
+
+		/* This either does nothing and returns immediately, or
+		   reads a MOUNT_DONE from mount.gfs and calls
+		   mount_done_old(), which sets kernel_mount_done and
+		   kernel_mount_error. */
+
+		process_connection(mg->mount_client);
+	}
+
+	return rv;
+}
+
+/* The processing of new mounters (send/recv options, send/recv journals,
+   notify mount.gfs) is not very integrated with the stop/start/finish
+   callbacks from libgroup.  A start callback just notifies us of a new
+   mounter and the options/journals messages drive things from there.
+   Recovery for failed nodes _is_ controlled more directly by the
+   stop/start/finish callbacks.  So, processing new mounters happens
+   independently of recovery and of the libgroup callbacks.  One place
+   where they need to intersect, though, is in stopping/suspending
+   gfs-kernel:
+   - When we get a stop callback, we need to be certain that gfs-kernel
+     is blocked.
+   - When a mounter notifies mount.gfs to go ahead, gfs-kernel will
+     shortly begin running in an unblocked fashion as it goes through
+     the kernel mounting process.
+   Given this, we need to be sure that if gfs-kernel is supposed to be
+   blocked, we don't notify mount.gfs to go ahead and do the kernel mount
+   since that starts gfs-kernel in an unblocked state. */
+
+/* - if we're unmounting, the kernel is gone, so no problem.
+   - if we've just mounted and notified mount.gfs, then wait for kernel
+     mount and then block.
+   - if we're mounting and have not yet notified mount.gfs, then set
+     a flag that delays the notification until block is set to 0. */
+
+/* Handle a stop: make sure gfs-kernel is blocked (or will not be started
+   unblocked), then acknowledge with group_stop_done().  Always returns 0. */
+
+int do_stop(struct mountgroup *mg)
+{
+	if (mg->first_mounter && !mg->kernel_mount_done) {
+		log_group(mg, "do_stop skip during first mount recovery");
+	} else {
+		/* Keep trying to set "block" until it works or we find a
+		   reason why it doesn't need to.  A failure to block can
+		   mean:
+		   1. the kernel instance of gfs existed before but is
+		      gone now, i.e. it was unmounted and is implicitly
+		      stopped
+		   2. we're mounting and gfs hasn't created the sysfs
+		      files for this fs yet
+		   3. we're mounting and mount(2) returned an error
+		   4. we're mounting but haven't told mount.gfs to go
+		      ahead with mount(2) yet
+		   Case 2 can also turn into case 3 while we're inside
+		   wait_for_kernel_mount(). */
+
+		while (set_sysfs(mg, "block", 1)) {
+			if (mg->got_kernel_mount) {
+				log_group(mg, "do_stop skipped fs unmounted");
+				break;
+			}
+
+			if (!mg->mount_client_notified) {
+				log_group(mg, "do_stop causes mount_client_delay");
+				mg->mount_client_delay = 1;
+				break;
+			}
+
+			if (mg->kernel_mount_error) {
+				log_group(mg, "do_stop ignore, failed mount");
+				break;
+			}
+
+			/* a negative result ends the retries; anything
+			   else loops back to set "block" again */
+			log_group(mg, "do_stop wait for kernel mount");
+			if (wait_for_kernel_mount(mg) < 0)
+				break;
+		}
+	}
+
+	group_stop_done(gh, mg->name);
+	return 0;
+}
+
+/*  After a start that initiated a recovery, everyone will go and see if they
+    can do recovery and try if they can.  If a node can't, it does start_done,
+    if it tries and fails, it does start_done, if it tries and succeeds it
+    sends a message and then does start_done once it receives it back.  So,
+    when we get a finish we know that we have all the results from the recovery
+    cycle and can judge if everything is recovered properly or not.  If so, we
+    can unblock locks (in the finish), if not, we leave them blocked (in the
+    finish).
+
+    If we leave locks blocked in the finish, then they can only be unblocked
+    after someone is able to do the recovery that's needed.  So, leaving locks
+    blocked in a finish because recovery hasn't worked puts us into a special
+    state: the fs needs recovery, none of the current mounters has been able to
+    recover it, all current mounters have locks blocked in gfs, new mounters
+    are allowed, nodes can unmount, new mounters are asked to do first-mounter
+    recovery, if one of them succeeds then we can all clear this special state
+    and unblock locks (the unblock would happen upon recving the success
+    message from the new pseudo-first mounter, not as part of a finish), future
+    finishes would then go back to being able to unblock locks.
+
+    While in this special state, a new node has been added and asked to do
+    first-mounter recovery, other nodes can also be added while the new
+    first-mounter is active.  These other nodes don't notify mount.gfs.
+    They'll receive the result of the first mounter and if it succeeded they'll
+    notify mount.gfs, otherwise one of them will become the next first-mounter
+    and notify mount.gfs.
+
+    do_finish: prune members_gone (entries with no recovery status or with
+    RS_SUCCESS are freed; any other status sets mg->needs_recovery), mark
+    every current member finished, then either leave the group (when
+    group_leave_on_finish is set) or, when no recovery is needed, set sysfs
+    "block" to 0 and send a delayed mount notification unless
+    first_mounter_recovery(mg) is true.  Always returns 0. */
+
+int do_finish(struct mountgroup *mg)
+{
+	struct mg_member *memb, *safe;
+
+	log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+		  mg->needs_recovery);
+
+	/* members_gone list are the members that were removed from the
+	   members list when processing a start.  members are removed
+	   from members_gone if their journals have been recovered */
+
+	list_for_each_entry_safe(memb, safe, &mg->members_gone, list) {
+		if (!memb->recovery_status) {	/* status is 0: just drop */
+			list_del(&memb->list);
+			free(memb);
+		} else if (memb->recovery_status == RS_SUCCESS) {
+			ASSERT(memb->gone_event <= mg->last_finish);
+			log_group(mg, "finish: recovered jid %d nodeid %d",
+				  memb->jid, memb->nodeid);
+			list_del(&memb->list);
+			free(memb);
+		} else {	/* entry stays on members_gone */
+			log_error("%s finish: needs recovery jid %d nodeid %d "
+				  "status %d", mg->name, memb->jid,
+				  memb->nodeid, memb->recovery_status);
+			mg->needs_recovery = 1;
+		}
+	}
+
+	list_for_each_entry(memb, &mg->members, list)
+		memb->finished = 1;
+
+	if (mg->group_leave_on_finish) {
+		log_group(mg, "leaving group after delay for join to finish");
+		group_leave(gh, mg->name);
+		mg->group_leave_on_finish = 0;
+		return 0;	/* returns before any unblock below */
+	}
+
+	if (!mg->needs_recovery) {
+		set_sysfs(mg, "block", 0);
+
+		/* we may have been holding back our local mount due to
+		   being stopped/blocked */
+		if (mg->mount_client_delay && !first_mounter_recovery(mg)) {
+			mg->mount_client_delay = 0;
+			notify_mount_client(mg);
+		}
+	} else
+		log_group(mg, "finish: leave locks blocked for needs_recovery");
+
+	return 0;
+}
+
+/*
+ * - require the first mounter to be rw, not ro or spectator.
+ *
+ * - if rw mounter fails, leaving only spectator mounters,
+ * require the next mounter to be rw, more ro/spectator mounts should
+ * fail until the fs is mounted rw.
+ *
+ * - if last rw mounter fails and ro mounters are left (possibly with
+ * some spectators), disallow any ro->rw remounts, leave gfs blocked,
+ * require next mounter to be rw, have next mounter do first mount
+ * gfs/journal recovery.
+ */
+
+/* called for the initial start on the node that's first to mount the fs.
+   (it should be ok to let the first mounter be a spectator, gfs should do
+   first recovery and bail out if there are any dirty journals)
+
+   NOTE(review): the code below does not allow that; a readonly or
+   spectator first mounter is given jid -2 and mount_client_result
+   -EUCLEAN.  Otherwise we take jid 0, set MEMB_OPT_RECOVER on our member
+   entry and mark ourselves as the first mounter.  Both paths end with
+   start_done() followed by notify_mount_client(). */
+
+/* FIXME: if journal recovery fails on any of the journals, we should
+   fail the mount */
+
+static void start_first_mounter(struct mountgroup *mg)
+{
+	struct mg_member *memb;
+
+	log_group(mg, "start_first_mounter");
+	set_our_memb_options(mg);
+	memb = find_memb_nodeid(mg, our_nodeid);
+	ASSERT(memb);
+
+	if (mg->readonly || mg->spectator) {
+		memb->jid = -2;
+		mg->our_jid = -2;
+		log_group(mg, "start_first_mounter not rw ro=%d spect=%d",
+			  mg->readonly, mg->spectator);
+		mg->mount_client_result = -EUCLEAN;
+	} else {
+		memb->opts |= MEMB_OPT_RECOVER;
+		memb->jid = 0;
+		mg->our_jid = 0;
+		mg->first_mounter = 1;
+		mg->first_mounter_done = 0;
+		mg->got_our_options = 1;
+		mg->got_our_journals = 1;
+	}
+	start_done(mg);
+	notify_mount_client(mg);
+}
+
+/* called for the initial start on a rw/ro mounter;
+   the existing mounters are running start_participant().
+   Sets our member options, calls send_options() and then acks the start
+   with start_done(); mount.gfs is not notified from here. */
+
+static void start_participant_init(struct mountgroup *mg)
+{
+	log_group(mg, "start_participant_init");
+	set_our_memb_options(mg);
+	send_options(mg);
+	start_done(mg);
+}
+
+/* Non-initial start on a normal (non-spectator) mounter.
+
+   pos != 0: ack the start, then replay saved options messages.
+   pos == 0 and neg != 0: run journal recovery, then replay saved
+   recovery status messages.
+   Both zero: nothing to do.
+
+   NB we can get here without having received a journals message for
+   our (recent) mount yet in which case we don't know the jid or ro/rw
+   status of any members, and don't know our own jid. */
+
+static void start_participant(struct mountgroup *mg, int pos, int neg)
+{
+	log_group(mg, "start_participant pos=%d neg=%d", pos, neg);
+
+	if (pos) {
+		start_done(mg);
+		/* options messages are saved for nodes whose start we
+		   had not yet received */
+		process_saved_options(mg);
+		return;
+	}
+
+	if (!neg)
+		return;
+
+	recover_journals(mg);
+	process_saved_recovery_status(mg);
+}
+
+/* Second stage of the initial start on a spectator mounter, run
+   after _receive_journals().  Checks the jid we were given and then
+   notifies the mount client. */
+
+static void start_spectator_init_2(struct mountgroup *mg)
+{
+	log_group(mg, "start_spectator_init_2 our_jid=%d", mg->our_jid);
+
+	if (mg->our_jid != -2) {
+		/* any jid other than -2 must be -1 here */
+		ASSERT(mg->our_jid == -1);
+	} else {
+		/* jid -2 means we are not permitted to mount the fs;
+		   probably because the next mounter must be rw */
+		mg->mount_client_result = -EUCLEAN;
+	}
+
+	notify_mount_client(mg);
+}
+
+/* called for the initial start on a spectator mounter.
+   Sets our member options, calls send_options(), acks the start, and
+   registers start_spectator_init_2() in mg->start2_fn; the code that
+   later invokes start2_fn is not visible here. */
+
+static void start_spectator_init(struct mountgroup *mg)
+{
+	log_group(mg, "start_spectator_init");
+	set_our_memb_options(mg);
+	send_options(mg);
+	start_done(mg);
+	mg->start2_fn = start_spectator_init_2;
+}
+
+/* Non-initial start on a spectator mounter.
+
+   pos != 0: ack the start, then replay saved options messages.
+   pos == 0 and neg != 0: run journal recovery, then replay saved
+   recovery status messages.
+   Both zero: nothing to do. */
+
+static void start_spectator(struct mountgroup *mg, int pos, int neg)
+{
+	log_group(mg, "start_spectator pos=%d neg=%d", pos, neg);
+
+	if (pos) {
+		start_done(mg);
+		process_saved_options(mg);
+		return;
+	}
+
+	if (!neg)
+		return;
+
+	recover_journals(mg);
+	process_saved_recovery_status(mg);
+}
+
+/* If nodeA fails, nodeB is recovering journalA and nodeB fails before
+   finishing, then nodeC needs to tell gfs to recover both journalA and
+   journalB.  To arrange that, every members_gone entry whose recovery
+   status is set to something other than RS_NEED_RECOVERY is put back
+   into the RS_NEED_RECOVERY state with tell_gfs_to_recover set to 1. */
+
+static void reset_unfinished_recoveries(struct mountgroup *mg)
+{
+	struct mg_member *gone;
+
+	list_for_each_entry(gone, &mg->members_gone, list) {
+		/* leave alone entries with no status and entries that are
+		   already waiting for recovery */
+		if (!gone->recovery_status ||
+		    gone->recovery_status == RS_NEED_RECOVERY)
+			continue;
+
+		log_group(mg, "retry unfinished recovery "
+			  "jid %d nodeid %d",
+			  gone->jid, gone->nodeid);
+
+		gone->recovery_status = RS_NEED_RECOVERY;
+		gone->local_recovery_status = RS_NEED_RECOVERY;
+		gone->tell_gfs_to_recover = 1;
+	}
+}
+
+/*
+   old method:
+   A is rw mount, B mounts rw
+
+   do_start		do_start
+   start_participant	start_participant_init
+   			send_options
+   receive_options
+   start_participant_2
+   discover_journals
+   assign B a jid
+   send_journals
+   group_start_done
+   			receive_journals
+			start_participant_init_2
+			group_start_done
+   do_finish		do_finish
+
+   new method: decouples stop/start/finish from mount processing
+   A is rw mount, B mounts rw
+
+   do_start		do_start
+   start_participant	start_participant_init
+   start_done		send_options
+   			start_done
+   do_finish		do_finish
+
+   receive_options
+   assign_journal
+   send_journals
+   			receive_journals
+			start_participant_init_2
+			notify_mount_client
+*/
+
+/* Handle a start callback for mg.  Records the event number and type,
+   passes the member list to recover_members() (which fills in pos and
+   neg), resets unfinished recoveries, then dispatches to the start
+   routine matching our state: initial vs. non-initial start, sole
+   member, spectator or normal mounter.  mg->init is cleared after the
+   initial start routine has run. */
+
+void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids)
+{
+	int pos = 0, neg = 0;
+
+	mg->start_type = type;
+	mg->start_event_nr = mg->last_start;
+
+	log_group(mg, "start %d init %d type %d member_count %d",
+		  mg->last_start, mg->init, type, member_count);
+
+	recover_members(mg, member_count, nodeids, &pos, &neg);
+	reset_unfinished_recoveries(mg);
+
+	if (!mg->init) {
+		/* non-initial start */
+		if (mg->spectator)
+			start_spectator(mg, pos, neg);
+		else
+			start_participant(mg, pos, neg);
+		return;
+	}
+
+	/* initial start: we are alone, a spectator, or a normal mounter */
+	if (member_count == 1)
+		start_first_mounter(mg);
+	else if (mg->spectator)
+		start_spectator_init(mg);
+	else
+		start_participant_init(mg);
+
+	mg->init = 0;
+}
+
+/*
+  What repercussions are there from umount shutting down gfs in the
+  kernel before we leave the mountgroup?  We can no longer participate
+  in recovery even though we're in the group -- what are the end cases
+  that we need to deal with where this causes a problem?  i.e. there
+  is a period of time where the mountgroup=A,B,C but the kernel fs
+  is only active on A,B, not C.  The mountgroup on A,B can't depend
+  on the mg on C to necessarily be able to do some things (recovery).
+
+  At least in part, it means that after we do an umount and have
+  removed the instance of this fs in the kernel, we'll still get
+  stop/start/finish callbacks from groupd for which we'll attempt
+  and fail to: block/unblock gfs kernel activity, initiate gfs
+  journal recovery, get recovery-done signals from the kernel.
+ 
+  We don't want to hang groupd event processing by failing to send
+  an ack (stop_done/start_done) back to groupd when it needs one
+  to proceed.  In the case where we get a start for a failed node
+  that needs journal recovery, we have a problem because we wait to
+  call group_start_done() until gfs in the kernel signals that
+  the journal recovery is done.  If we've unmounted, gfs isn't there
+  any more to give us this signal and we'll never call start_done.
+ 
+  update: we should be dealing with all these issues correctly now. */
+
+/* Handle the terminate callback for our own leave.  Calls
+   purge_plocks(mg, 0, 1), then either frees mg (plain unmount) or, for
+   a withdraw, sets sysfs "withdraw" to 1 and moves mg onto
+   withdrawn_mounts.  Always returns 0. */
+
+int do_terminate(struct mountgroup *mg)
+{
+	purge_plocks(mg, 0, 1);
+
+	if (!mg->withdraw) {
+		log_group(mg, "termination of our unmount leave");
+		list_del(&mg->list);
+		free(mg);
+		return 0;
+	}
+
+	log_group(mg, "termination of our withdraw leave");
+	set_sysfs(mg, "withdraw", 1);
+	list_move(&mg->list, &withdrawn_mounts);
+
+	return 0;
+}
+
+/* The basic rule of withdraw is that we don't want to tell the kernel to drop
+   all locks until we know gfs has been stopped/blocked on all nodes.  They'll
+   be stopped for our leave, we just need to know when they've all arrived
+   there.
+
+   A withdrawing node is very much like a readonly node, differences are
+   that others recover its journal when they remove it from the group,
+   and when it's been removed from the group (gets terminate for its leave),
+   it tells the locally withdrawing gfs to clear out locks. */
+
+/* Start a withdraw for the fs identified by table.  The mountgroup name
+   is the part of table that follows the first ':'.
+
+   Returns 0 if the withdraw feature is disabled (nothing is done) or if
+   the device was suspended, in which case dmsetup_wait is set to 1.
+   Returns -1 if table has no ':', if no mountgroup has that name, or if
+   run_dmsetup_suspend() reports an error. */
+
+int do_withdraw_old(char *table)
+{
+	struct mountgroup *mg;
+	char *name;
+	int rv;
+
+	if (!cfgd_enable_withdraw) {
+		log_error("withdraw feature not enabled");
+		return 0;
+	}
+
+	/* A table without ':' has no name part; stepping past a NULL
+	   search result would hand find_mg() an invalid pointer. */
+	name = strchr(table, ':');
+	if (!name) {
+		log_error("do_withdraw invalid table name %s", table);
+		return -1;
+	}
+	name++;
+
+	mg = find_mg(name);
+	if (!mg) {
+		log_error("do_withdraw no mountgroup %s", name);
+		return -1;
+	}
+
+	rv = run_dmsetup_suspend(mg, mg->mount_args.dev);
+	if (rv) {
+		log_error("do_withdraw %s: dmsetup %s error %d", mg->name,
+			  mg->mount_args.dev, rv);
+		return -1;
+	}
+
+	dmsetup_wait = 1;
+	return 0;
+}
+
+/* Dispatch one cpg message to the receive_*() handler for its type.
+
+   nodeid is the sender as reported by cpg; data/len is the raw message,
+   which starts with a struct gdlm_header.  The header's integer fields
+   are converted from little-endian in place before use.
+
+   A message is dropped when it is too short to hold a header, names a
+   mountgroup we don't have, carries a different major protocol version,
+   or arrives before this mountgroup has had any callback. */
+
+static void do_deliver(int nodeid, char *data, int len)
+{
+	struct mountgroup *mg;
+	struct gdlm_header *hd;
+
+	/* every header field, including name, is read below, so the
+	   buffer has to hold at least a full header */
+	if (len < (int) sizeof(struct gdlm_header)) {
+		log_error("cpg message from %d too short len %d",
+			  nodeid, len);
+		return;
+	}
+
+	hd = (struct gdlm_header *) data;
+
+	mg = find_mg(hd->name);
+	if (!mg) {
+		/*
+		log_error("cpg message from %d len %d no group %s",
+			  nodeid, len, hd->name);
+		*/
+		return;
+	}
+
+	hd->version[0]	= le16_to_cpu(hd->version[0]);
+	hd->version[1]	= le16_to_cpu(hd->version[1]);
+	hd->version[2]	= le16_to_cpu(hd->version[2]);
+	hd->type	= le16_to_cpu(hd->type);
+	hd->nodeid	= le32_to_cpu(hd->nodeid);
+	hd->to_nodeid	= le32_to_cpu(hd->to_nodeid);
+
+	/* FIXME: we need to look at how to gracefully fail when we end up
+	   with mixed incompat versions */
+
+	if (hd->version[0] != protocol_active[0]) {
+		log_error("reject message from %d version %u.%u.%u vs %u.%u.%u",
+			  nodeid, hd->version[0], hd->version[1],
+			  hd->version[2], protocol_active[0],
+			  protocol_active[1], protocol_active[2]);
+		return;
+	}
+
+	/* If there are some group messages between a new node being added to
+	   the cpg group and being added to the app group, the new node should
+	   discard them since they're only relevant to the app group. */
+
+	if (!mg->last_callback) {
+		log_group(mg, "discard %s len %d from %d",
+			  msg_name(hd->type), len, nodeid);
+		return;
+	}
+
+	switch (hd->type) {
+	case MSG_JOURNAL:
+		receive_journals(mg, data, len, nodeid);
+		break;
+
+	case MSG_OPTIONS:
+		receive_options(mg, data, len, nodeid);
+		break;
+
+	case MSG_REMOUNT:
+		receive_remount(mg, data, len, nodeid);
+		break;
+
+	case MSG_PLOCK:
+		receive_plock(mg, data, len, nodeid);
+		break;
+
+	case MSG_MOUNT_STATUS:
+		receive_mount_status(mg, data, len, nodeid);
+		break;
+
+	case MSG_RECOVERY_STATUS:
+		receive_recovery_status(mg, data, len, nodeid);
+		break;
+
+	case MSG_RECOVERY_DONE:
+		receive_recovery_done(mg, data, len, nodeid);
+		break;
+
+	case MSG_WITHDRAW:
+		receive_withdraw(mg, data, len, nodeid);
+		break;
+
+	case MSG_PLOCK_OWN:
+		receive_own(mg, data, len, nodeid);
+		break;
+
+	case MSG_PLOCK_DROP:
+		receive_drop(mg, data, len, nodeid);
+		break;
+
+	case MSG_PLOCK_SYNC_LOCK:
+	case MSG_PLOCK_SYNC_WAITER:
+		/* both sync message types share one handler */
+		receive_sync(mg, data, len, nodeid);
+		break;
+
+	default:
+		log_error("unknown message type %d from %d",
+			  hd->type, hd->nodeid);
+	}
+}
+
+static void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name,
+		uint32_t nodeid, uint32_t pid, void *data, int data_len)
+{
+	do_deliver(nodeid, data, data_len);	/* handle, group_name, pid unused */
+}
+
+/* Not sure if purging plocks (driven by confchg) needs to be synchronized with
+   the other recovery steps (driven by libgroup) for a node, don't think so.
+   Is it possible for a node to have been cleared from the members_gone list
+   before this confchg is processed? */
+
+/* cpg configuration change callback.  Only left_list is examined: for
+   each departed node, plocks are purged in every mountgroup in which
+   that node is a member or a removed member.  member_list and
+   joined_list are ignored. */
+
+static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
+		struct cpg_address *member_list, int member_list_entries,
+		struct cpg_address *left_list, int left_list_entries,
+		struct cpg_address *joined_list, int joined_list_entries)
+{
+	struct mountgroup *mg;
+	int i, gone;
+
+	for (i = 0; i < left_list_entries; i++) {
+		gone = left_list[i].nodeid;
+
+		list_for_each_entry(mg, &mountgroups, list) {
+			/* node is unknown to this mountgroup */
+			if (!is_member(mg, gone) && !is_removed(mg, gone))
+				continue;
+
+			purge_plocks(mg, left_list[i].nodeid, 0);
+		}
+	}
+}
+
+static cpg_callbacks_t callbacks = {	/* passed to cpg_initialize() */
+	.cpg_deliver_fn = deliver_cb,	/* incoming messages */
+	.cpg_confchg_fn = confchg_cb,	/* membership changes */
+};
+
+/* Query the cpg flow control state for daemon_handle and mirror it in
+   message_flow_control_on (1 when enabled, 0 otherwise).  A debug line
+   is logged whenever the flag changes.  If the query fails, the error
+   is logged and the flag is left as it was. */
+
+void update_flow_control_status(void)
+{
+	cpg_flow_control_state_t state;
+	cpg_error_t rc;
+	int enabled;
+
+	rc = cpg_flow_control_state_get(daemon_handle, &state);
+	if (rc != CPG_OK) {
+		log_error("cpg_flow_control_state_get %d", rc);
+		return;
+	}
+
+	enabled = (state == CPG_FLOW_CONTROL_ENABLED);
+
+	if (enabled && message_flow_control_on == 0) {
+		log_debug("flow control on");
+	} else if (!enabled && message_flow_control_on) {
+		log_debug("flow control off");
+	}
+
+	message_flow_control_on = enabled;
+}
+
+/* Dispatch all pending cpg events for daemon_handle and, if that
+   succeeds, refresh the flow control flag.  A dispatch error is logged
+   and the refresh is skipped.  The ci argument is not used. */
+
+void process_cpg_old(int ci)
+{
+	cpg_error_t rc = cpg_dispatch(daemon_handle, CPG_DISPATCH_ALL);
+
+	if (rc == CPG_OK) {
+		update_flow_control_status();
+		return;
+	}
+
+	log_error("cpg_dispatch error %d", rc);
+}
+
+/* Set up the daemon's cpg connection: initialize withdrawn_mounts,
+   select the active protocol version (v200 when cfgd_plock_ownership is
+   set, otherwise v100), initialize cpg with our callbacks, fetch the
+   cpg file descriptor and join the "gfs_controld" group, retrying once
+   a second while cpg_join() reports CPG_ERR_TRY_AGAIN.
+
+   Returns the cpg fd on success, or -1 on failure.  Once
+   cpg_initialize() has succeeded, every failure path finalizes the
+   handle before returning. */
+
+int setup_cpg_old(void)
+{
+	cpg_error_t error;
+	int fd = -1;
+
+	INIT_LIST_HEAD(&withdrawn_mounts);
+
+	if (cfgd_plock_ownership)
+		memcpy(protocol_active, protocol_v200, sizeof(protocol_v200));
+	else
+		memcpy(protocol_active, protocol_v100, sizeof(protocol_v100));
+
+	error = cpg_initialize(&daemon_handle, &callbacks);
+	if (error != CPG_OK) {
+		log_error("cpg_initialize error %d", error);
+		return -1;
+	}
+
+	/* check the call's own status; fd starts at -1 so that a call
+	   which fails without writing fd cannot look like success */
+	error = cpg_fd_get(daemon_handle, &fd);
+	if (error != CPG_OK || fd < 0) {
+		log_error("cpg_fd_get error %d", error);
+		cpg_finalize(daemon_handle);
+		return -1;
+	}
+
+	memset(&daemon_name, 0, sizeof(daemon_name));
+	strcpy(daemon_name.value, "gfs_controld");
+	daemon_name.length = strlen(daemon_name.value);
+
+ retry:
+	error = cpg_join(daemon_handle, &daemon_name);
+	if (error == CPG_ERR_TRY_AGAIN) {
+		log_debug("setup_cpg cpg_join retry");
+		sleep(1);
+		goto retry;
+	}
+	if (error != CPG_OK) {
+		log_error("cpg_join error %d", error);
+		cpg_finalize(daemon_handle);
+		return -1;
+	}
+
+	log_debug("cpg %d", fd);
+	return fd;
+}
+
diff --git a/group/gfs_controld/cpg-old.h b/group/gfs_controld/cpg-old.h
new file mode 100644
index 0000000..8bbbc5e
--- /dev/null
+++ b/group/gfs_controld/cpg-old.h
@@ -0,0 +1,60 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CPG_OLD_DOT_H__
+#define __CPG_OLD_DOT_H__
+
+#define DO_STOP 1
+#define DO_START 2
+#define DO_FINISH 3
+#define DO_TERMINATE 4
+#define DO_SETID 5
+
+enum {	/* message types carried in gdlm_header.type; numbered from 1 in
+	   declaration order, so reordering entries changes their values */
+
+	MSG_JOURNAL = 1,
+	MSG_OPTIONS,
+	MSG_REMOUNT,
+	MSG_PLOCK,
+	MSG_WITHDRAW,
+	MSG_MOUNT_STATUS,
+	MSG_RECOVERY_STATUS,
+	MSG_RECOVERY_DONE,
+	MSG_PLOCK_OWN,
+	MSG_PLOCK_DROP,
+	MSG_PLOCK_SYNC_LOCK,
+	MSG_PLOCK_SYNC_WAITER,
+};
+
+/* These lengths are part of the wire protocol. */
+
+#define MAX_OPTIONS_LEN		1024
+#define MSG_NAMELEN		255
+
+struct gdlm_header {	/* leading header of every message seen by do_deliver() */
+	uint16_t		version[3];	/* little-endian on the wire; [0] is checked against protocol_active[0] */
+	uint16_t		type;		/* MSG_, little-endian on the wire */
+	uint32_t		nodeid;		/* sender, little-endian on the wire */
+	uint32_t		to_nodeid;	/* 0 if to all, little-endian on the wire */
+	char			name[MSG_NAMELEN];	/* mountgroup name, looked up with find_mg() */
+};
+
+struct save_msg {	/* presumably one queued copy of a received message -- confirm against users */
+	struct list_head list;
+	int nodeid;
+	int len;	/* presumably the number of bytes in buf -- confirm */
+	int type;
+	char buf[0];	/* zero-length trailing array (GNU C extension) */
+};
+
+#endif
+
diff --git a/group/gfs_controld/cpg.c b/group/gfs_controld/cpg.c
deleted file mode 100644
index b002591..0000000
--- a/group/gfs_controld/cpg.c
+++ /dev/null
@@ -1,289 +0,0 @@
-/******************************************************************************
-*******************************************************************************
-**
-**  Copyright (C) 2006-2007 Red Hat, Inc.  All rights reserved.
-**
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
-**
-*******************************************************************************
-******************************************************************************/
-
-#include <openais/cpg.h>
-#include "lock_dlm.h"
-
-extern struct list_head mounts;
-extern unsigned int     protocol_active[3];
-static cpg_handle_t	daemon_handle;
-static struct cpg_name	daemon_name;
-int			message_flow_control_on;
-
-void receive_journals(struct mountgroup *mg, char *buf, int len, int from);
-void receive_options(struct mountgroup *mg, char *buf, int len, int from);
-void receive_remount(struct mountgroup *mg, char *buf, int len, int from);
-void receive_plock(struct mountgroup *mg, char *buf, int len, int from);
-void receive_own(struct mountgroup *mg, char *buf, int len, int from);
-void receive_drop(struct mountgroup *mg, char *buf, int len, int from);
-void receive_sync(struct mountgroup *mg, char *buf, int len, int from);
-void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from);
-void receive_mount_status(struct mountgroup *mg, char *buf, int len, int from);
-void receive_recovery_status(struct mountgroup *mg, char *buf, int len,
-			     int from);
-void receive_recovery_done(struct mountgroup *mg, char *buf, int len, int from);
-char *msg_name(int type);
-
-static void do_deliver(int nodeid, char *data, int len)
-{
-	struct mountgroup *mg;
-	struct gdlm_header *hd;
-
-	hd = (struct gdlm_header *) data;
-
-	mg = find_mg(hd->name);
-	if (!mg) {
-		/*
-		log_error("cpg message from %d len %d no group %s",
-			  nodeid, len, hd->name);
-		*/
-		return;
-	}
-
-	hd->version[0]	= le16_to_cpu(hd->version[0]);
-	hd->version[1]	= le16_to_cpu(hd->version[1]);
-	hd->version[2]	= le16_to_cpu(hd->version[2]);
-	hd->type	= le16_to_cpu(hd->type);
-	hd->nodeid	= le32_to_cpu(hd->nodeid);
-	hd->to_nodeid	= le32_to_cpu(hd->to_nodeid);
-
-	/* FIXME: we need to look at how to gracefully fail when we end up
-	   with mixed incompat versions */
-
-	if (hd->version[0] != protocol_active[0]) {
-		log_error("reject message from %d version %u.%u.%u vs %u.%u.%u",
-			  nodeid, hd->version[0], hd->version[1],
-			  hd->version[2], protocol_active[0],
-			  protocol_active[1], protocol_active[2]);
-		return;
-	}
-
-	/* If there are some group messages between a new node being added to
-	   the cpg group and being added to the app group, the new node should
-	   discard them since they're only relevant to the app group. */
-
-	if (!mg->last_callback) {
-		log_group(mg, "discard %s len %d from %d",
-			  msg_name(hd->type), len, nodeid);
-		return;
-	}
-
-	switch (hd->type) {
-	case MSG_JOURNAL: 
-		receive_journals(mg, data, len, nodeid);
-		break;
-
-	case MSG_OPTIONS:
-		receive_options(mg, data, len, nodeid);
-		break;
-
-	case MSG_REMOUNT:
-		receive_remount(mg, data, len, nodeid);
-		break;
-
-	case MSG_PLOCK:
-		receive_plock(mg, data, len, nodeid);
-		break;
-
-	case MSG_MOUNT_STATUS:
-		receive_mount_status(mg, data, len, nodeid);
-		break;
-
-	case MSG_RECOVERY_STATUS:
-		receive_recovery_status(mg, data, len, nodeid);
-		break;
-
-	case MSG_RECOVERY_DONE:
-		receive_recovery_done(mg, data, len, nodeid);
-		break;
-
-	case MSG_WITHDRAW:
-		receive_withdraw(mg, data, len, nodeid);
-		break;
-
-	case MSG_PLOCK_OWN:
-		receive_own(mg, data, len, nodeid);
-		break;
-
-	case MSG_PLOCK_DROP:
-		receive_drop(mg, data, len, nodeid);
-		break;
-
-	case MSG_PLOCK_SYNC_LOCK:
-	case MSG_PLOCK_SYNC_WAITER:
-		receive_sync(mg, data, len, nodeid);
-		break;
-
-	default:
-		log_error("unknown message type %d from %d",
-			  hd->type, hd->nodeid);
-	}
-}
-
-void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name,
-		uint32_t nodeid, uint32_t pid, void *data, int data_len)
-{
-	do_deliver(nodeid, data, data_len);
-}
-
-/* Not sure if purging plocks (driven by confchg) needs to be synchronized with
-   the other recovery steps (driven by libgroup) for a node, don't think so.
-   Is it possible for a node to have been cleared from the members_gone list
-   before this confchg is processed? */
-
-void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
-		struct cpg_address *member_list, int member_list_entries,
-		struct cpg_address *left_list, int left_list_entries,
-		struct cpg_address *joined_list, int joined_list_entries)
-{
-	struct mountgroup *mg;
-	int i, nodeid;
-
-	for (i = 0; i < left_list_entries; i++) {
-		nodeid = left_list[i].nodeid;
-		list_for_each_entry(mg, &mounts, list) {
-			if (is_member(mg, nodeid) || is_removed(mg, nodeid))
-				purge_plocks(mg, left_list[i].nodeid, 0);
-		}
-	}
-}
-
-static cpg_callbacks_t callbacks = {
-	.cpg_deliver_fn = deliver_cb,
-	.cpg_confchg_fn = confchg_cb,
-};
-
-void update_flow_control_status(void)
-{
-	cpg_flow_control_state_t flow_control_state;
-	cpg_error_t error;
-	
-	error = cpg_flow_control_state_get(daemon_handle, &flow_control_state);
-	if (error != CPG_OK) {
-		log_error("cpg_flow_control_state_get %d", error);
-		return;
-	}
-
-	if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) {
-		if (message_flow_control_on == 0) {
-			log_debug("flow control on");
-		}
-		message_flow_control_on = 1;
-	} else {
-		if (message_flow_control_on) {
-			log_debug("flow control off");
-		}
-		message_flow_control_on = 0;
-	}
-}
-
-int process_cpg(void)
-{
-	cpg_error_t error;
-
-	error = cpg_dispatch(daemon_handle, CPG_DISPATCH_ALL);
-	if (error != CPG_OK) {
-		log_error("cpg_dispatch error %d", error);
-		return -1;
-	}
-
-	update_flow_control_status();
-
-	return 0;
-}
-
-int setup_cpg(void)
-{
-	cpg_error_t error;
-	int fd = 0;
-
-	error = cpg_initialize(&daemon_handle, &callbacks);
-	if (error != CPG_OK) {
-		log_error("cpg_initialize error %d", error);
-		return -1;
-	}
-
-	cpg_fd_get(daemon_handle, &fd);
-	if (fd < 0) {
-		log_error("cpg_fd_get error %d", error);
-		return -1;
-	}
-
-	memset(&daemon_name, 0, sizeof(daemon_name));
-	strcpy(daemon_name.value, "gfs_controld");
-	daemon_name.length = 12;
-
- retry:
-	error = cpg_join(daemon_handle, &daemon_name);
-	if (error == CPG_ERR_TRY_AGAIN) {
-		log_debug("setup_cpg cpg_join retry");
-		sleep(1);
-		goto retry;
-	}
-	if (error != CPG_OK) {
-		log_error("cpg_join error %d", error);
-		cpg_finalize(daemon_handle);
-		return -1;
-	}
-
-	log_debug("cpg %d", fd);
-	return fd;
-}
-
-static int _send_message(cpg_handle_t h, void *buf, int len, int type)
-{
-	struct iovec iov;
-	cpg_error_t error;
-	int retries = 0;
-
-	iov.iov_base = buf;
-	iov.iov_len = len;
-
- retry:
-	error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1);
-	if (error == CPG_ERR_TRY_AGAIN) {
-		retries++;
-		usleep(1000);
-		if (!(retries % 100))
-			log_error("cpg_mcast_joined retry %d %s",
-				   retries, msg_name(type));
-		goto retry;
-	}
-	if (error != CPG_OK) {
-		log_error("cpg_mcast_joined error %d handle %llx %s",
-			  error, (unsigned long long)h, msg_name(type));
-		return -1;
-	}
-
-	if (retries)
-		log_debug("cpg_mcast_joined retried %d %s",
-			  retries, msg_name(type));
-
-	return 0;
-}
-
-int send_group_message(struct mountgroup *mg, int len, char *buf)
-{
-	struct gdlm_header *hd = (struct gdlm_header *) buf;
-	int type = hd->type;
-
-	hd->version[0]	= cpu_to_le16(protocol_active[0]);
-	hd->version[1]	= cpu_to_le16(protocol_active[1]);
-	hd->version[2]	= cpu_to_le16(protocol_active[2]);
-	hd->type	= cpu_to_le16(hd->type);
-	hd->nodeid	= cpu_to_le32(hd->nodeid);
-	hd->to_nodeid	= cpu_to_le32(hd->to_nodeid);
-	memcpy(hd->name, mg->name, strlen(mg->name));
-	
-	return _send_message(daemon_handle, buf, len, type);
-}
-
diff --git a/group/gfs_controld/gfs_controld.h b/group/gfs_controld/gfs_controld.h
new file mode 100644
index 0000000..3759cd1
--- /dev/null
+++ b/group/gfs_controld/gfs_controld.h
@@ -0,0 +1,49 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __GFS_CONTROLD_DOT_H__
+#define __GFS_CONTROLD_DOT_H__
+
+/* This defines the interface between gfs_controld and libgfscontrol, and
+   should only be used by libgfscontrol. */
+
+#define GFSC_SOCK_PATH                  "gfsc_sock"
+#define GFSC_QUERY_SOCK_PATH            "gfsc_query_sock"
+
+#define GFSC_MAGIC                      0x6F5C6F5C
+#define GFSC_VERSION                    0x00010001
+
+#define GFSC_CMD_DUMP_DEBUG             1
+#define GFSC_CMD_DUMP_PLOCKS            2
+#define GFSC_CMD_MOUNTGROUP_INFO        3
+#define GFSC_CMD_NODE_INFO              4
+#define GFSC_CMD_MOUNTGROUPS            5
+#define GFSC_CMD_MOUNTGROUP_NODES       6
+#define GFSC_CMD_FS_JOIN		7
+#define GFSC_CMD_FS_REMOUNT		8
+#define GFSC_CMD_FS_MOUNT_DONE		9
+#define GFSC_CMD_FS_LEAVE		10
+
+struct gfsc_header {	/* header exchanged between gfs_controld and libgfscontrol */
+	unsigned int magic;	/* presumably GFSC_MAGIC -- confirm */
+	unsigned int version;	/* presumably GFSC_VERSION -- confirm */
+	unsigned int command;	/* presumably one of GFSC_CMD_ -- confirm */
+	unsigned int option;
+	unsigned int len;
+	int data;       /* embedded command-specific data, for convenience */
+	int unused1;
+	int unsued2;	/* NOTE(review): looks like a misspelling of "unused2" */
+	char name[GFS_MOUNTGROUP_LEN]; /* no terminating null space */
+};
+
+#endif
+
diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
new file mode 100644
index 0000000..fc82d71
--- /dev/null
+++ b/group/gfs_controld/gfs_daemon.h
@@ -0,0 +1,268 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __GFS_DAEMON_DOT_H__
+#define __GFS_DAEMON_DOT_H__
+
+#include <sys/types.h>
+#include <asm/types.h>
+#include <sys/uio.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/utsname.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <netdb.h>
+#include <limits.h>
+#include <unistd.h>
+#include <time.h>
+#include <syslog.h>
+#include <sched.h>
+#include <signal.h>
+#include <sys/time.h>
+#include <dirent.h>
+#include <openais/saAis.h>
+#include <openais/saCkpt.h>
+#include <openais/cpg.h>
+
+#include <linux/dlmconstants.h>
+#include "libgfscontrol.h"
+#include "gfs_controld.h"
+#include "list.h"
+#include "linux_endian.h"
+
+/* TODO: warn if
+   DLM_LOCKSPACE_LEN (from dlmconstants.h) !=
+   GFS_MOUNTGROUP_LEN (from libgfscontrol.h)
+*/
+
+/* Maximum members of a mountgroup, should match CPG_MEMBERS_MAX in
+   openais/cpg.h.  There are no max defines in gfs-kernel for
+   mountgroup members. (FIXME verify gfs-kernel/lock_dlm) */
+
+#define MAX_NODES       128
+
+/* Max string length printed on a line, for debugging/dump output. */
+
+#define MAXLINE         256
+
+extern int daemon_debug_opt;
+extern int daemon_quit;
+extern int poll_ignore_plock;
+extern int plock_fd;
+extern int plock_ci;
+extern struct list_head mountgroups;
+extern int cman_quorate;
+extern int our_nodeid;
+extern char *clustername;
+extern char daemon_debug_buf[256];
+extern char dump_buf[GFSC_DUMP_SIZE];
+extern int dump_point;
+extern int dump_wrap;
+extern char plock_dump_buf[GFSC_DUMP_SIZE];
+extern int plock_dump_len;
+extern int dmsetup_wait;
+
+void daemon_dump_save(void);
+
+#define log_debug(fmt, args...) \
+do { \
+	snprintf(daemon_debug_buf, 255, "%ld " fmt "\n", time(NULL), ##args); \
+	if (daemon_debug_opt) fprintf(stderr, "%s", daemon_debug_buf); \
+	daemon_dump_save(); \
+} while (0)
+
+#define log_group(g, fmt, args...) \
+do { \
+	snprintf(daemon_debug_buf, 255, "%ld %s " fmt "\n", time(NULL), \
+		 (g)->name, ##args); \
+	if (daemon_debug_opt) fprintf(stderr, "%s", daemon_debug_buf); \
+	daemon_dump_save(); \
+} while (0)
+
+#define log_plock(g, fmt, args...) \
+do { \
+	snprintf(daemon_debug_buf, 255, "%ld %s " fmt "\n", time(NULL), \
+		 (g)->name, ##args); \
+	if (cfgd_plock_debug) fprintf(stderr, "%s", daemon_debug_buf); \
+} while (0)
+
+#define log_error(fmt, args...) \
+do { \
+	log_debug(fmt, ##args); \
+	syslog(LOG_ERR, fmt, ##args); \
+} while (0)
+
+#define ASSERT(x) \
+do { \
+	if (!(x)) { \
+		log_error("Assertion failed on line %d of file %s\n" \
+			  "Assertion:  \"%s\"\n", __LINE__, __FILE__, #x); \
+	} \
+} while (0)
+
+struct mountgroup {
+	struct list_head	list;
+	uint32_t		id;
+	struct gfsc_mount_args	mount_args;
+	char			name[GFS_MOUNTGROUP_LEN+1];
+	int			old_group_mode;
+
+	int			mount_client;
+	int			mount_client_fd;
+	int			mount_client_result;
+	int			mount_client_notified;
+	int			mount_client_delay;
+	int			remount_client;
+
+	int			withdraw;
+	int			dmsetup_wait;
+	pid_t			dmsetup_pid;
+
+	/* cpg-old stuff for rhel5/stable2 compat */
+
+	struct list_head	members;
+	struct list_head	members_gone;
+	int			memb_count;
+	int			last_stop;
+	int			last_start;
+	int			last_finish;
+	int			last_callback;
+	int			start_event_nr;
+	int			start_type;
+	int                     group_leave_on_finish;
+	int			init;
+	int			got_our_options;
+	int			got_our_journals;
+	int			delay_send_journals;
+	int			kernel_mount_error;
+	int			kernel_mount_done;
+	int			got_kernel_mount;
+	int			first_mount_pending_stop;
+	int			first_mounter;
+	int			first_mounter_done;
+	int			global_first_recover_done;
+	int			emulate_first_mounter;
+	int			wait_first_done;
+	int			low_nodeid;
+	int			master_nodeid;
+	int			reject_mounts;
+	int			needs_recovery;
+	int			our_jid;
+	int			spectator;
+	int			readonly;
+	int			rw;
+	struct list_head	saved_messages;
+	void			*start2_fn;
+
+	/* cpg-old plock stuff */
+
+	int			save_plocks;
+	struct list_head	plock_resources;
+	uint32_t		associated_ls_id;
+	uint64_t		cp_handle;
+	time_t			last_checkpoint_time;
+	time_t			last_plock_time;
+	struct timeval		drop_resources_last;
+};
+
+/* these need to match the kernel defines of the same name in lm_interface.h */
+
+#define LM_RD_GAVEUP 308
+#define LM_RD_SUCCESS 309
+
+/* config.c */
+void read_ccs(void);
+void read_ccs_nodir(struct mountgroup *mg, char *buf);
+
+/* cpg-old.c */
+int setup_cpg_old(void);
+void process_cpg_old(int ci);
+int send_group_message_old(struct mountgroup *mg, int len, char *buf);
+void save_message_old(struct mountgroup *mg, char *buf, int len, int from,
+		      int type);
+void send_withdraw_old(struct mountgroup *mg);
+void ping_kernel_mount_old(char *table);
+int join_mountgroup_old(int ci, struct gfsc_mount_args *ma);
+int kernel_recovery_done_old(char *table);
+int remount_mountgroup_old(int ci, struct gfsc_mount_args *ma);
+int leave_mountgroup_old(char *table, int mnterr);
+void mount_done_old(struct gfsc_mount_args *ma, int result);
+int do_stop(struct mountgroup *mg);
+int do_finish(struct mountgroup *mg);
+void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids);
+int do_terminate(struct mountgroup *mg);
+int do_withdraw_old(char *table);
+void update_flow_control_status(void);
+
+/* group.c */
+int setup_groupd(void);
+void process_groupd(int ci);
+
+/* main.c */
+int do_read(int fd, void *buf, size_t count);
+int do_write(int fd, void *buf, size_t count);
+void client_dead(int ci);
+int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci));
+int client_fd(int ci);
+void client_ignore(int ci, int fd);
+void client_back(int ci, int fd);
+struct mountgroup *create_mg(char *name);
+struct mountgroup *find_mg(char *name);
+struct mountgroup *find_mg_id(uint32_t id);
+void client_reply_remount(struct mountgroup *mg, int result);
+void client_reply_join(int ci, struct gfsc_mount_args *ma, int result);
+void client_reply_join_full(struct mountgroup *mg, int result);
+void query_lock(void);
+void query_unlock(void);
+void process_connection(int ci);
+
+/* member_cman.c */
+int setup_cman(void);
+void process_cman(int ci);
+
+/* plock.c */
+int setup_plocks(void);
+void process_plocks(int ci);
+int limit_plocks(void);
+void receive_plock(struct mountgroup *mg, char *buf, int len, int from);
+void receive_own(struct mountgroup *mg, char *buf, int len, int from);
+void receive_sync(struct mountgroup *mg, char *buf, int len, int from);
+void receive_drop(struct mountgroup *mg, char *buf, int len, int from);
+void process_saved_plocks(struct mountgroup *mg);
+int unlink_checkpoint(struct mountgroup *mg);
+void store_plocks(struct mountgroup *mg, int nodeid);
+void retrieve_plocks(struct mountgroup *mg);
+void purge_plocks(struct mountgroup *mg, int nodeid, int unmount);
+int fill_plock_dump_buf(struct mountgroup *mg);
+
+/* util.c */
+int we_are_in_fence_domain(void);
+int set_sysfs(struct mountgroup *mg, char *field, int val);
+int read_sysfs_int(struct mountgroup *mg, char *field, int *val_out);
+int run_dmsetup_suspend(struct mountgroup *mg, char *dev);
+void update_dmsetup_wait(void);
+
+#endif
diff --git a/group/gfs_controld/group.c b/group/gfs_controld/group.c
index f797786..116f48e 100644
--- a/group/gfs_controld/group.c
+++ b/group/gfs_controld/group.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -10,24 +10,24 @@
 *******************************************************************************
 ******************************************************************************/
 
-#include "lock_dlm.h"
+#include "gfs_daemon.h"
+#include "cpg-old.h"
+#include "libgroup.h"
+
+#define LOCK_DLM_GROUP_LEVEL    2
+#define LOCK_DLM_GROUP_NAME     "gfs"
 
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
 
 group_handle_t gh;
 static int cb_action;
-static char cb_name[MAX_GROUP_NAME_LEN+1];
+static char cb_name[GFS_MOUNTGROUP_LEN+1];
 static int cb_event_nr;
 static unsigned int cb_id;
 static int cb_type;
 static int cb_member_count;
-static int cb_members[MAX_GROUP_MEMBERS];
-
-int do_stop(struct mountgroup *mg);
-int do_finish(struct mountgroup *mg);
-int do_terminate(struct mountgroup *mg);
-int do_start(struct mountgroup *mg, int type, int count, int *nodeids);
+static int cb_members[MAX_NODES];
 
 
 static void stop_cbfn(group_handle_t h, void *private, char *name)
@@ -42,7 +42,7 @@ static void start_cbfn(group_handle_t h, void *private, char *name,
 	int i;
 
 	cb_action = DO_START;
-	strncpy(cb_name, name, MAX_GROUP_NAME_LEN);
+	strncpy(cb_name, name, GFS_MOUNTGROUP_LEN);
 	cb_event_nr = event_nr;
 	cb_type = type;
 	cb_member_count = member_count;
@@ -55,51 +55,56 @@ static void finish_cbfn(group_handle_t h, void *private, char *name,
 			int event_nr)
 {
 	cb_action = DO_FINISH;
-	strncpy(cb_name, name, MAX_GROUP_NAME_LEN);
+	strncpy(cb_name, name, GFS_MOUNTGROUP_LEN);
 	cb_event_nr = event_nr;
 }
 
 static void terminate_cbfn(group_handle_t h, void *private, char *name)
 {
 	cb_action = DO_TERMINATE;
-	strncpy(cb_name, name, MAX_GROUP_NAME_LEN);
+	strncpy(cb_name, name, GFS_MOUNTGROUP_LEN);
 }
 
 static void setid_cbfn(group_handle_t h, void *private, char *name,
 		       unsigned int id)
 {
 	cb_action = DO_SETID;
-	strncpy(cb_name, name, MAX_GROUP_NAME_LEN);
+	strncpy(cb_name, name, GFS_MOUNTGROUP_LEN);
 	cb_id = id;
 }
 
-static void deliver_cbfn(group_handle_t h, void *private, char *name,
-			 int nodeid, int len, char *buf)
-{
-}
-
 static group_callbacks_t callbacks = {
 	stop_cbfn,
 	start_cbfn,
 	finish_cbfn,
 	terminate_cbfn,
 	setid_cbfn,
-	deliver_cbfn
 };
 
-char *str_members(void)
+static char *str_members(void)
 {
-	static char buf[MAXLINE];
-	int i, len = 0;
-
-	memset(buf, 0, MAXLINE);
-
-	for (i = 0; i < cb_member_count; i++)
-		len += sprintf(buf+len, "%d ", cb_members[i]);
-	return buf;
+	static char str_members_buf[MAXLINE];
+	int i, ret, pos = 0, len = MAXLINE;
+
+	memset(str_members_buf, 0, MAXLINE);
+
+	for (i = 0; i < cb_member_count; i++) {
+		if (i != 0) {
+			ret = snprintf(str_members_buf + pos, len - pos, " ");
+			if (ret >= len - pos)
+				break;
+			pos += ret;
+		}
+		ret = snprintf(str_members_buf + pos, len - pos, "%d",
+			       cb_members[i]);
+		if (ret >= len - pos)
+			break;
+		pos += ret;
+	}
+	return str_members_buf;
 }
 
-int process_groupd(void)
+void process_groupd(int ci)
 {
 	struct mountgroup *mg;
 	int error = 0;
@@ -160,7 +165,6 @@ int process_groupd(void)
 
  out:
 	cb_action = 0;
-	return error;
 }
 
 int setup_groupd(void)
diff --git a/group/gfs_controld/lock_dlm.h b/group/gfs_controld/lock_dlm.h
deleted file mode 100644
index c5bcb80..0000000
--- a/group/gfs_controld/lock_dlm.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/******************************************************************************
-*******************************************************************************
-**
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
-**  
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
-**
-*******************************************************************************
-******************************************************************************/
-
-#ifndef __LOCK_DLM_DOT_H__
-#define __LOCK_DLM_DOT_H__
-
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <stddef.h>
-#include <string.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <time.h>
-#include <syslog.h>
-#include <sched.h>
-#include <limits.h>
-#include <asm/types.h>
-#include <sys/socket.h>
-#include <sys/poll.h>
-#include <sys/un.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/errno.h>
-#include <linux/netlink.h>
-
-#include "list.h"
-#include "linux_endian.h"
-#include "libgroup.h"
-
-#define MAXARGS			16
-#define MAXLINE			256
-#define MAXNAME			255
-#define MAX_CLIENTS		8
-#define MAX_MSGLEN		2048
-#define MAX_OPTIONS_LEN		1024
-#define DUMP_SIZE		(1024 * 1024)
-
-#define LOCK_DLM_GROUP_LEVEL	2
-#define LOCK_DLM_GROUP_NAME	"gfs"
-#define LOCK_DLM_SOCK_PATH	"gfs_controld_sock"
-
-#ifndef TRUE
-#define TRUE (1)
-#endif
-#ifndef FALSE
-#define FALSE (0)
-#endif
-
-enum {
-	DO_STOP = 1,
-	DO_START,
-	DO_FINISH,
-	DO_TERMINATE,
-	DO_SETID,
-	DO_DELIVER,
-};
-
-extern int plock_debug_opt;
-extern int daemon_debug_opt;
-extern char daemon_debug_buf[256];
-extern char dump_buf[DUMP_SIZE];
-extern int dump_point;
-extern int dump_wrap;
-
-extern void daemon_dump_save(void);
-
-#define log_debug(fmt, args...) \
-do { \
-	snprintf(daemon_debug_buf, 255, "%ld " fmt "\n", time(NULL), ##args); \
-	if (daemon_debug_opt) fprintf(stderr, "%s", daemon_debug_buf); \
-	daemon_dump_save(); \
-} while (0)
-
-#define log_group(g, fmt, args...) \
-do { \
-	snprintf(daemon_debug_buf, 255, "%ld %s " fmt "\n", time(NULL), \
-		 (g)->name, ##args); \
-	if (daemon_debug_opt) fprintf(stderr, "%s", daemon_debug_buf); \
-	daemon_dump_save(); \
-} while (0)
-
-#define log_plock(g, fmt, args...) \
-do { \
-	snprintf(daemon_debug_buf, 255, "%ld %s " fmt "\n", time(NULL), \
-		 (g)->name, ##args); \
-	if (plock_debug_opt) fprintf(stderr, "%s", daemon_debug_buf); \
-} while (0)
-
-#define log_error(fmt, args...) \
-do { \
-	log_debug(fmt, ##args); \
-	syslog(LOG_ERR, fmt, ##args); \
-} while (0)
-
-#define ASSERT(x) \
-do { \
-	if (!(x)) { \
-		log_error("Assertion failed on line %d of file %s\n" \
-			  "Assertion:  \"%s\"\n", __LINE__, __FILE__, #x); \
-	} \
-} while (0)
-
-struct mountpoint {
-	struct list_head	list;
-	char			dir[PATH_MAX+1];
-	int			client;
-};
-
-struct mountgroup {
-	struct list_head	list;
-	uint32_t		id;
-	uint32_t		associated_ls_id;
-	struct list_head	members;
-	struct list_head	members_gone;
-	int			memb_count;
-	struct list_head	resources; /* for plocks */
-	struct list_head	mountpoints;
-
-	char			name[MAXNAME+1];
-	char			table[MAXNAME+1];
-	char			type[5];
-	char			options[MAX_OPTIONS_LEN+1];
-	char			dev[PATH_MAX+1];
-
-	int			last_stop;
-	int			last_start;
-	int			last_finish;
-	int			last_callback;
-	int			start_event_nr;
-	int			start_type;
-
-	char			error_msg[128];
-	int			mount_client;
-	int			mount_client_fd;
-	int			mount_client_notified;
-	int			mount_client_delay;
-	int                     group_leave_on_finish;
-	int			remount_client;
-	int			init;
-	int			got_our_options;
-	int			got_our_journals;
-	int			delay_send_journals;
-	int			kernel_mount_error;
-	int			kernel_mount_done;
-	int			got_kernel_mount;
-	int			first_mount_pending_stop;
-	int			first_mounter;
-	int			first_mounter_done;
-	int			global_first_recover_done;
-	int			emulate_first_mounter;
-	int			wait_first_done;
-	int			low_nodeid;
-	int			master_nodeid;
-	int			save_plocks;
-	int			reject_mounts;
-
-	uint64_t		cp_handle;
-	time_t			last_checkpoint_time;
-	time_t			last_plock_time;
-	struct timeval		drop_resources_last;
-
-	int			needs_recovery;
-	int			our_jid;
-	int			spectator;
-	int			readonly;
-	int			rw;
-	int			withdraw;
-	int			dmsetup_wait;
-	pid_t			dmsetup_pid;
-
-	struct list_head	saved_messages;
-	void			*start2_fn;
-};
-
-/* mg_member opts bit field */
-
-enum {
-	MEMB_OPT_RW		= 1,
-	MEMB_OPT_RO		= 2,
-	MEMB_OPT_SPECT		= 4,
-	MEMB_OPT_RECOVER	= 8,
-};
-
-/* these need to match the kernel defines of the same name in
-   linux/fs/gfs2/lm_interface.h */
-
-#define LM_RD_GAVEUP 308
-#define LM_RD_SUCCESS 309
-
-/* mg_member state: local_recovery_status, recovery_status */
-
-enum {
-	RS_NEED_RECOVERY = 1,
-	RS_SUCCESS,
-	RS_GAVEUP,
-	RS_NOFS,
-	RS_READONLY,
-};
-
-struct mg_member {
-	struct list_head	list;
-	int			nodeid;
-	int			jid;
-
-	int			spectator;
-	int			readonly;
-	int			rw;
-	uint32_t		opts;
-
-	int			tell_gfs_to_recover;
-	int			wait_gfs_recover_done;
-	int			gone_event;
-	int			gone_type;
-	int			finished;
-	int			local_recovery_status;
-	int			recovery_status;
-	int			withdrawing;
-	int			needs_journals;
-
-	int			ms_kernel_mount_done;
-	int			ms_first_mounter;
-	int			ms_kernel_mount_error;
-};
-
-enum {
-	MSG_JOURNAL = 1,
-	MSG_OPTIONS,
-	MSG_REMOUNT,
-	MSG_PLOCK,
-	MSG_WITHDRAW,
-	MSG_MOUNT_STATUS,
-	MSG_RECOVERY_STATUS,
-	MSG_RECOVERY_DONE,
-	MSG_PLOCK_OWN,
-	MSG_PLOCK_DROP,
-	MSG_PLOCK_SYNC_LOCK,
-	MSG_PLOCK_SYNC_WAITER,
-};
-
-struct gdlm_header {
-	uint16_t		version[3];
-	uint16_t		type;			/* MSG_ */
-	uint32_t		nodeid;			/* sender */
-	uint32_t		to_nodeid;		/* 0 if to all */
-	char			name[MAXNAME];
-};
-
-struct save_msg {
-	struct list_head list;
-	int nodeid;
-	int len;
-	int type;
-	char buf[0];
-};
-
-int do_read(int fd, void *buf, size_t count);
-int do_write(int fd, void *buf, size_t count);
-struct mountgroup *find_mg(char *name);
-struct mountgroup *find_mg_id(uint32_t id);
-struct mg_member *find_memb_nodeid(struct mountgroup *mg, int nodeid);
-int is_member(struct mountgroup *mg, int nodeid);
-int is_removed(struct mountgroup *mg, int nodeid);
-
-int setup_cman(void);
-int process_cman(void);
-int setup_cpg(void);
-int process_cpg(void);
-int setup_groupd(void);
-int process_groupd(void);
-int setup_plocks(void);
-int process_plocks(void);
-void exit_cman(void);
-
-int do_mount(int ci, char *dir, char *type, char *proto, char *table,
-	     char *options, char *dev, struct mountgroup **mg_ret);
-int do_unmount(int ci, char *dir, int mnterr);
-int do_remount(int ci, char *dir, char *mode);
-int do_withdraw(char *name);
-int kernel_recovery_done(char *name);
-void ping_kernel_mount(char *table);
-void save_message(struct mountgroup *mg, char *buf, int len, int from, int type);
-void got_mount_result(struct mountgroup *mg, int result, int ci, int another);
-
-int client_send(int ci, char *buf, int len);
-int get_sysfs(struct mountgroup *mg, char *field, char *buf, int len);
-
-int send_group_message(struct mountgroup *mg, int len, char *buf);
-void update_flow_control_status(void);
-
-void store_plocks(struct mountgroup *mg, int nodeid);
-void retrieve_plocks(struct mountgroup *mg);
-int dump_plocks(char *name, int fd);
-void process_saved_plocks(struct mountgroup *mg);
-void purge_plocks(struct mountgroup *mg, int nodeid, int unmount);
-int unlink_checkpoint(struct mountgroup *mg);
-void update_dmsetup_wait(void);
-
-#endif
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index dddb74b..7786be1 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -10,81 +10,31 @@
 *******************************************************************************
 ******************************************************************************/
 
-#include "lock_dlm.h"
-#include "ccs.h"
+#include "gfs_daemon.h"
+#include "config.h"
+#include <pthread.h>
+#include <linux/netlink.h>
 
-#define OPTION_STRING			"DPhVwpl:o:t:c:a:"
-#define LOCKFILE_NAME			"/var/run/gfs_controld.pid"
+#define LOCKFILE_NAME	"/var/run/gfs_controld.pid"
+#define CLIENT_NALLOC   32
+#define GROUP_LIBGROUP  2
+#define GROUP_LIBCPG    3
 
-#define DEFAULT_NO_WITHDRAW 0 /* enable withdraw by default */
-#define DEFAULT_NO_PLOCK 0 /* enable plocks by default */
-
-/* max number of plock ops we will cpg-multicast per second */
-#define DEFAULT_PLOCK_RATE_LIMIT 100
-
-/* disable ownership by default because it's a different protocol */
-#define DEFAULT_PLOCK_OWNERSHIP 0
-
-/* max frequency of drop attempts in ms */
-#define DEFAULT_DROP_RESOURCES_TIME 10000 /* 10 sec */
-
-/* max number of resources to drop per time period */
-#define DEFAULT_DROP_RESOURCES_COUNT 10
-
-/* resource not accessed for this many ms before subject to dropping */
-#define DEFAULT_DROP_RESOURCES_AGE 10000 /* 10 sec */
+static int client_maxi;
+static int client_size;
+static struct client *client;
+static struct pollfd *pollfd;
+static int group_mode;
+static pthread_t query_thread;
+static pthread_mutex_t query_mutex;
 
 struct client {
 	int fd;
-	char type[32];
+	void *workfn;
+	void *deadfn;
 	struct mountgroup *mg;
-	int another_mount;
 };
 
-extern struct list_head mounts;
-extern struct list_head withdrawn_mounts;
-extern group_handle_t gh;
-
-int dmsetup_wait;
-
-/* cpg message protocol
-   1.0.0 is initial version
-   2.0.0 is incompatible with 1.0.0 and allows plock ownership */
-unsigned int protocol_v100[3] = {1, 0, 0};
-unsigned int protocol_v200[3] = {2, 0, 0};
-unsigned int protocol_active[3];
-
-/* user configurable */
-int config_no_withdraw;
-int config_no_plock;
-uint32_t config_plock_rate_limit;
-uint32_t config_plock_ownership;
-uint32_t config_drop_resources_time;
-uint32_t config_drop_resources_count;
-uint32_t config_drop_resources_age;
-
-/* command line settings override corresponding cluster.conf settings */
-static int opt_no_withdraw;
-static int opt_no_plock;
-static int opt_plock_rate_limit;
-static int opt_plock_ownership;
-static int opt_drop_resources_time;
-static int opt_drop_resources_count;
-static int opt_drop_resources_age;
-
-static int client_maxi;
-static int client_size = 0;
-static struct client *client = NULL;
-static struct pollfd *pollfd = NULL;
-static int cman_fd;
-static int cpg_fd;
-static int listen_fd;
-static int groupd_fd;
-static int uevent_fd;
-static int plocks_fd;
-static int plocks_ci;
-
-
 int do_read(int fd, void *buf, size_t count)
 {
 	int rv, off = 0;
@@ -123,25 +73,140 @@ int do_write(int fd, void *buf, size_t count)
 	return 0;
 }
 
-#if 0
-static void make_args(char *buf, int *argc, char **argv, char sep)
+static void client_alloc(void)
 {
-	char *p = buf;
 	int i;
 
-	argv[0] = p;
+	/* realloc(NULL, n) behaves like malloc(n) and client_size starts at 0,
+	   so one path serves both the first and every later allocation */
+	struct client *nc = realloc(client, (client_size + CLIENT_NALLOC) *
+					    sizeof(struct client));
+	struct pollfd *np = realloc(pollfd, (client_size + CLIENT_NALLOC) *
+					    sizeof(struct pollfd));
+	/* on failure realloc leaves the old array valid, so never drop it */
+	client = nc ? nc : client;
+	pollfd = np ? np : pollfd;
+	if (!nc || !np) {
+		log_error("can't alloc for client array");
+		exit(EXIT_FAILURE);	/* arrays did not grow; init loop must not run */
+	}
+
+	for (i = client_size; i < client_size + CLIENT_NALLOC; i++) {
+		client[i].workfn = NULL;
+		client[i].deadfn = NULL;
+		client[i].fd = -1;
+		pollfd[i].fd = -1;
+		pollfd[i].revents = 0;
+	}
+	client_size += CLIENT_NALLOC;
+}
 
-	for (i = 1; i < MAXARGS; i++) {
-		p = strchr(buf, sep);
-		if (!p)
-			break;
-		*p = '\0';
-		argv[i] = p + 1;
-		buf = p + 1;
+void client_dead(int ci)
+{
+	close(client[ci].fd);
+	client[ci].workfn = NULL;
+	client[ci].fd = -1;	/* -1 is what client_add() looks for in a free slot */
+	pollfd[ci].fd = -1;
+}
+
+int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci))
+{
+	int i;
+
+	if (!client)
+		client_alloc();
+ again:
+	for (i = 0; i < client_size; i++) {
+		if (client[i].fd == -1) {
+			client[i].workfn = workfn;
+			if (deadfn)
+				client[i].deadfn = deadfn;
+			else
+				client[i].deadfn = client_dead;
+			client[i].fd = fd;
+			pollfd[i].fd = fd;
+			pollfd[i].events = POLLIN;
+			if (i > client_maxi)
+				client_maxi = i;
+			return i;
+		}
 	}
-	*argc = i;
+
+	client_alloc();
+	goto again;
+}
+
+int client_fd(int ci)
+{
+	return client[ci].fd;
 }
-#endif
+
+void client_ignore(int ci, int fd)	/* fd is not used by this function */
+{
+	pollfd[ci].fd = -1;	/* poll() ignores entries whose fd is negative */
+	pollfd[ci].events = 0;
+}
+
+void client_back(int ci, int fd)
+{
+	pollfd[ci].fd = fd;	/* undo client_ignore(): poll this fd again */
+	pollfd[ci].events = POLLIN;
+}
+
+static void sigterm_handler(int sig)
+{
+	daemon_quit = 1;	/* NOTE(review): declared as plain int in gfs_daemon.h; volatile sig_atomic_t is the type guaranteed safe to set from a handler */
+}
+
+struct mountgroup *create_mg(char *name)
+{
+	struct mountgroup *new_mg = calloc(1, sizeof(*new_mg));
+
+	/* calloc zero-fills, so any field not assigned below starts at 0 */
+	if (!new_mg)
+		return NULL;
+
+	/* copies at most GFS_MOUNTGROUP_LEN bytes; the extra zeroed byte in
+	   name[] keeps the string terminated */
+	strncpy(new_mg->name, name, GFS_MOUNTGROUP_LEN);
+	if (group_mode == GROUP_LIBGROUP)
+		new_mg->old_group_mode = 1;
+	new_mg->init = 1;
+	new_mg->low_nodeid = -1;
+	new_mg->master_nodeid = -1;
+
+	INIT_LIST_HEAD(&new_mg->saved_messages);
+	INIT_LIST_HEAD(&new_mg->plock_resources);
+	INIT_LIST_HEAD(&new_mg->members_gone);
+	INIT_LIST_HEAD(&new_mg->members);
+
+	return new_mg;
+}
+
+struct mountgroup *find_mg(char *name)
+{
+	struct mountgroup *cur;
+
+	/* exact, whole-string match on the mountgroup name */
+	list_for_each_entry(cur, &mountgroups, list) {
+		if (strcmp(cur->name, name) == 0)
+			return cur;
+	}
+	return NULL;
+}
+
+struct mountgroup *find_mg_id(uint32_t id)
+{
+	struct mountgroup *cur;	/* list cursor */
+
+	list_for_each_entry(cur, &mountgroups, list) {
+		if (id == cur->id)
+			return cur;
+	}
+	return NULL;	/* no mountgroup has this id */
+}
+
+#define MAXARGS 8
 
 static char *get_args(char *buf, int *argc, char **argv, char sep, int want)
 {
@@ -156,7 +221,7 @@ static char *get_args(char *buf, int *argc, char **argv, char sep, int want)
 			break;
 		*p = '\0';
 
-		if (want == i) { 
+		if (want == i) {
 			rp = p + 1;
 			break;
 		}
@@ -173,134 +238,142 @@ static char *get_args(char *buf, int *argc, char **argv, char sep, int want)
 	return rp;
 }
 
-static int client_add(int fd)
+static void process_uevent(int ci)
 {
-	int i;
-
-	while (1) {
-		/* This fails the first time with client_size of zero */
-		for (i = 0; i < client_size; i++) {
-			if (client[i].fd == -1) {
-				client[i].fd = fd;
-				pollfd[i].fd = fd;
-				pollfd[i].events = POLLIN;
-				if (i > client_maxi)
-					client_maxi = i;
-				return i;
-			}
-		}
+	char buf[MAXLINE];
+	char *argv[MAXARGS], *act, *sys;
+	int rv, argc = 0;
+	int lock_module = 0;
 
-		/* We didn't find an empty slot, so allocate more. */
-		client_size += MAX_CLIENTS;
-
-		if (!client) {
-			client = malloc(client_size * sizeof(struct client));
-			pollfd = malloc(client_size * sizeof(struct pollfd));
-		} else {
-			client = realloc(client, client_size *
-						 sizeof(struct client));
-			pollfd = realloc(pollfd, client_size *
-						 sizeof(struct pollfd));
-		}
-		if (!client || !pollfd)
-			log_error("Can't allocate client memory.");
+	memset(buf, 0, sizeof(buf));
+	memset(argv, 0, sizeof(char *) * MAXARGS);
 
-		for (i = client_size - MAX_CLIENTS; i < client_size; i++) {
-			client[i].fd = -1;
-			pollfd[i].fd = -1;
-		}
+ retry_recv:
+	rv = recv(client[ci].fd, buf, sizeof(buf) - 1, 0); /* keeps final NUL */
+	if (rv == -1 && errno == EINTR)	/* interrupted: try again */
+		goto retry_recv;
+	if (rv == -1 && errno == EAGAIN)	/* nothing to read right now */
+		return;
+	if (rv < 0) {
+		log_error("uevent recv error %d errno %d", rv, errno);
+		return;
+	}
-}
-
-/* I don't think we really want to try to do anything if mount.gfs is killed,
-   because I suspect there are various corner cases where we might not do the
-   right thing.  Even without the corner cases things still don't work out
-   too nicely.  Best to just tell people not to kill a mount or unmount
-   because doing so can leave things (kernel, group, mtab) in inconsistent
-   states that can't be straightened out properly without a reboot. */
 
-static void mount_client_dead(struct mountgroup *mg, int ci)
-{
-	char buf[MAXLINE];
-	int rv;
+	/* first we get the uevent for removing lock module kobject:
+	     "remove@/fs/gfs/bull:x/lock_module"
+	   second is the uevent for removing gfs kobject:
+	     "remove@/fs/gfs/bull:x"
+	*/
 
-	if (ci != mg->mount_client) {
-		log_error("mount client mismatch %d %d", ci, mg->mount_client);
+	if (!strstr(buf, "gfs"))
 		return;
-	}
 
-	/* is checking sysfs really a reliable way of telling whether the
-	   kernel has been mounted or not?  might the kernel mount just not
-	   have reached the sysfs registration yet? */
+	log_debug("uevent: %s", buf);
 
-	memset(buf, 0, sizeof(buf));
+	if (strstr(buf, "lock_module"))
+		lock_module = 1;
 
-	rv = get_sysfs(mg, "id", buf, sizeof(buf));
-	if (!rv) {
-		log_error("mount_client_dead ci %d sysfs id %s", ci, buf);
-#if 0
-		/* finish the mount, although there will be no mtab entry
-		   which will confuse umount causing it to do the kernel
-		   umount but not call umount.gfs */
-		got_mount_result(mg, 0, ci, client[ci].another_mount);
-#endif
-		return;
-	}
+	get_args(buf, &argc, argv, '/', 4);
+	if (argc != 4)
+		log_error("uevent message has %d args", argc);
+	act = argv[0];
+	sys = argv[2];
 
-	log_error("mount_client_dead ci %d no sysfs entry for fs", ci);
+	log_debug("kernel: %s %s", act, argv[3]);
 
-#if 0
-	mp = find_mountpoint_client(mg, ci);
-	if (mp) {
-		list_del(&mp->list);
-		free(mp);
+	if (!strcmp(act, "remove@")) {
+		/* We want to trigger the leave at the very end of the kernel's
+		   unmount process, i.e. at the end of put_super(), so we do the
+		   leave when the second uevent (from the gfs kobj) arrives. */
+
+		if (lock_module)
+			return;
+
+		if (group_mode == GROUP_LIBGROUP)
+			leave_mountgroup_old(argv[3], 0);
+
+	} else if (!strcmp(act, "change@")) {
+		if (!lock_module)
+			return;
+
+		if (group_mode == GROUP_LIBGROUP)
+			kernel_recovery_done_old(argv[3]);
+
+	} else if (!strcmp(act, "offline@")) {
+		if (!lock_module)
+			return;
+
+		if (group_mode == GROUP_LIBGROUP)
+			do_withdraw_old(argv[3]);
+
+	} else {
+		if (!lock_module)
+			return;
+
+		if (group_mode == GROUP_LIBGROUP)
+			ping_kernel_mount_old(argv[3]);
 	}
-	group_leave(gh, mg->name);
-#endif
 }
 
-static void client_dead(int ci)
+static int setup_uevent(void)
 {
-	struct mountgroup *mg;
+	struct sockaddr_nl snl;
+	int s, rv;
 
-	log_debug("client %d fd %d dead", ci, client[ci].fd);
+	s = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
+	if (s < 0) {
+		log_error("uevent netlink socket");
+		return s;
+	}
 
-	/* if the dead mount client is mount.gfs and we've not received
-	   a mount result, then try to put things into a clean state */
-	   
-	mg = client[ci].mg;
-	if (mg && mg->mount_client && mg->mount_client_fd)
-		mount_client_dead(mg, ci);
+	memset(&snl, 0, sizeof(snl));
+	snl.nl_family = AF_NETLINK;
+	snl.nl_pid = getpid();
+	snl.nl_groups = 1;
 
-	close(client[ci].fd);
-	client[ci].fd = -1;
-	pollfd[ci].fd = -1;
-	client[ci].mg = NULL;
-}
+	rv = bind(s, (struct sockaddr *) &snl, sizeof(snl));
+	if (rv < 0) {
+		log_error("uevent bind error %d errno %d", rv, errno);
+		close(s);
+		return rv;
+	}
 
-static void client_ignore(int ci, int fd)
-{
-	pollfd[ci].fd = -1;
-	pollfd[ci].events = 0;
+	return s;
 }
 
-static void client_back(int ci, int fd)
+static void init_header(struct gfsc_header *h, int cmd, char *name, int result,
+			int extra_len)
 {
-	pollfd[ci].fd = fd;
-	pollfd[ci].events = POLLIN;
-}
+	memset(h, 0, sizeof(struct gfsc_header));
 
-int client_send(int ci, char *buf, int len)
-{
-	return do_write(client[ci].fd, buf, len);
+	h->magic = GFSC_MAGIC;
+	h->version = GFSC_VERSION;
+	h->len = sizeof(struct gfsc_header) + extra_len;
+	h->command = cmd;
+	h->data = result;
+
+	if (name)
+		strncpy(h->name, name, GFS_MOUNTGROUP_LEN);
 }
 
-static int do_dump(int fd)
+static void query_dump_debug(int fd)
 {
+	struct gfsc_header h;
+	int extra_len;
 	int len;
 
+	/* in the case of dump_wrap, extra_len will go in two writes,
+	   first the log tail, then the log head */
+	if (dump_wrap)
+		extra_len = GFSC_DUMP_SIZE;
+	else
+		extra_len = dump_point;
+
+	init_header(&h, GFSC_CMD_DUMP_DEBUG, NULL, 0, extra_len);
+	do_write(fd, &h, sizeof(h));
+
 	if (dump_wrap) {
-		len = DUMP_SIZE - dump_point;
+		len = GFSC_DUMP_SIZE - dump_point;
 		do_write(fd, dump_buf + dump_point, len);
 		len = dump_point;
 	} else
@@ -310,137 +383,228 @@ static int do_dump(int fd)
 	dump_buf[dump_point] = '\0';
 
 	do_write(fd, dump_buf, len);
-
-	return 0;
 }
 
-#if 0
-/* mount.gfs sends us a special fd that it will write an error message to
-   if mount(2) fails.  We can monitor this fd for an error message while
-   waiting for the kernel mount outside our main poll loop */
-
-void setup_mount_error_fd(struct mountgroup *mg)
-{
-	struct msghdr msg;
-	struct cmsghdr *cmsg;
-	struct iovec vec;
-	char tmp[CMSG_SPACE(sizeof(int))];
-	int fd, socket = client[mg->mount_client].fd;
-	char ch;
-	ssize_t n;
-
-	memset(&msg, 0, sizeof(msg));
-
-	vec.iov_base = &ch;
-	vec.iov_len = 1;
-	msg.msg_iov = &vec;
-	msg.msg_iovlen = 1;
-	msg.msg_control = tmp;
-	msg.msg_controllen = sizeof(tmp);
-
-	n = recvmsg(socket, &msg, 0);
-	if (n < 0) {
-		log_group(mg, "setup_mount_error_fd recvmsg err %d errno %d",
-			  n, errno);
-		return;
-	}
-	if (n != 1) {
-		log_group(mg, "setup_mount_error_fd recvmsg got %ld", (long)n);
-		return;
+static void query_dump_plocks(int fd, char *name)
+{
+	struct mountgroup *mg;
+	struct gfsc_header h;
+	int rv;
+
+	mg = find_mg(name);
+	if (!mg) {
+		plock_dump_len = 0;
+		rv = -ENOENT;
+	} else {
+		/* writes to plock_dump_buf and sets plock_dump_len */
+		rv = fill_plock_dump_buf(mg);
 	}
 
-	cmsg = CMSG_FIRSTHDR(&msg);
+	init_header(&h, GFSC_CMD_DUMP_PLOCKS, name, rv, plock_dump_len);
+
+	do_write(fd, &h, sizeof(h));
+
+	if (plock_dump_len)
+		do_write(fd, plock_dump_buf, plock_dump_len);
+}
+
+/* combines a header and the data and sends it back to the client in
+   a single do_write() call */
+
+static void do_reply(int fd, int cmd, char *name, int result, void *buf,
+		     int buflen)
+{
+	char *reply;
+	int reply_len;
 
-	if (cmsg->cmsg_type != SCM_RIGHTS) {
-		log_group(mg, "setup_mount_error_fd expected type %d got %d",
-			  SCM_RIGHTS, cmsg->cmsg_type);
+	reply_len = sizeof(struct gfsc_header) + buflen;
+	reply = malloc(reply_len);
+	if (!reply)
 		return;
-	}
+	memset(reply, 0, reply_len);
+
+	init_header((struct gfsc_header *)reply, cmd, name, result, buflen);
 
-	fd = (*(int *)CMSG_DATA(cmsg));
-	mg->mount_error_fd = fd;
+	if (buf && buflen)
+		memcpy(reply + sizeof(struct gfsc_header), buf, buflen);
 
-	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
+	do_write(fd, reply, reply_len);
 
-	log_group(mg, "setup_mount_error_fd got fd %d", fd);
+	free(reply);
 }
-#endif
 
-static int process_client(int ci)
+void client_reply_remount(struct mountgroup *mg, int result)
 {
-	struct mountgroup *mg;
-	char buf[MAXLINE], *argv[MAXARGS], out[MAXLINE];
-	char *cmd = NULL;
-	int argc = 0, rv, fd;
+	struct gfsc_mount_args *ma = &mg->mount_args;
 
-	memset(buf, 0, MAXLINE);
-	memset(out, 0, MAXLINE);
-	memset(argv, 0, sizeof(char *) * MAXARGS);
+	log_group(mg, "remount_reply ci %d result %d",
+		  mg->remount_client, result);
+
+	do_reply(client[mg->remount_client].fd, GFSC_CMD_FS_REMOUNT,
+		 mg->name, result, ma, sizeof(struct gfsc_mount_args));
+
+	mg->remount_client = 0;
+}
+
+/* Reply to a join; the name is table past ':' (all of table if no ':'). */
+void client_reply_join(int ci, struct gfsc_mount_args *ma, int result)
+{
+	char *sep = strchr(ma->table, ':');	/* NULL when there is no ':' */
+	char *name = sep ? sep + 1 : ma->table;
+	log_debug("join_reply %s ci %d result %d", name, ci, result);
+	do_reply(client[ci].fd, GFSC_CMD_FS_JOIN,
+		 name, result, ma, sizeof(struct gfsc_mount_args));
+}
 
-	rv = read(client[ci].fd, buf, MAXLINE);
-	if (!rv) {
-		client_dead(ci);
-		return 0;
+void client_reply_join_full(struct mountgroup *mg, int result)
+{
+	char nodir_str[32];
+
+	if (result)
+		goto out;
+
+	if (mg->our_jid < 0) {
+		snprintf(mg->mount_args.hostdata, PATH_MAX,
+			 "hostdata=id=%u:first=%d",
+			 mg->id, mg->first_mounter);
+	} else {
+		snprintf(mg->mount_args.hostdata, PATH_MAX,
+			 "hostdata=jid=%d:id=%u:first=%d",
+			 mg->our_jid, mg->id, mg->first_mounter);
 	}
+
+	memset(nodir_str, 0, sizeof(nodir_str));
+
+	read_ccs_nodir(mg, nodir_str);
+	if (nodir_str[0])
+		strcat(mg->mount_args.hostdata, nodir_str);
+ out:
+	log_group(mg, "join_full_reply ci %d result %d hostdata %s",
+		  mg->mount_client, result, mg->mount_args.hostdata);
+
+	client_reply_join(mg->mount_client, &mg->mount_args, result);
+}
+
+void process_connection(int ci)
+{
+	struct gfsc_header h;
+	struct gfsc_mount_args empty;
+	struct gfsc_mount_args *ma;
+	char *extra = NULL;
+	int rv, extra_len;
+
+	rv = do_read(client[ci].fd, &h, sizeof(h));
 	if (rv < 0) {
-		log_debug("client %d fd %d read error %d %d", ci,
-			   client[ci].fd, rv, errno);
-		return rv;
+		log_debug("connection %d read error %d", ci, rv);
+		goto out;
 	}
 
-	log_debug("client %d: %s", ci, buf);
+	if (h.magic != GFSC_MAGIC) {
+		log_debug("connection %d magic error %x", ci, h.magic);
+		goto out;
+	}
 
-	get_args(buf, &argc, argv, ' ', 7);
-	cmd = argv[0];
-	rv = 0;
+	if ((h.version & 0xFFFF0000) != (GFSC_VERSION & 0xFFFF0000)) {
+		log_debug("connection %d version error %x", ci, h.version);
+		goto out;
+	}
 
-	if (!strcmp(cmd, "join")) {
-		/* ci, dir (mountpoint), type (gfs/gfs2), proto (lock_dlm),
-		   table (fsname:clustername), extra (rw), dev (/dev/sda1) */
-
-		rv = do_mount(ci, argv[1], argv[2], argv[3], argv[4], argv[5],
-			      argv[6], &mg);
-		fd = client[ci].fd;
-		fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
-		if (!rv || rv == -EALREADY) {
-			client[ci].another_mount = rv;
-			client[ci].mg = mg;
-			mg->mount_client_fd = fd;
+	if (h.len > sizeof(h)) {
+		extra_len = h.len - sizeof(h);
+		extra = malloc(extra_len);
+		if (!extra) {
+			log_error("process_connection no mem %d", extra_len);
+			goto out;
 		}
-		goto reply;
-	} else if (!strcmp(cmd, "mount_result")) {
-		got_mount_result(client[ci].mg, atoi(argv[3]), ci,
-				 client[ci].another_mount);
-	} else if (!strcmp(cmd, "leave")) {
-		rv = do_unmount(ci, argv[1], atoi(argv[3]));
-		goto reply;
-
-	} else if (!strcmp(cmd, "remount")) {
-		rv = do_remount(ci, argv[1], argv[3]);
-		goto reply;
-
-	} else if (!strcmp(cmd, "dump")) {
-		do_dump(client[ci].fd);
-		close(client[ci].fd);
-
-	} else if (!strcmp(cmd, "plocks")) {
-		dump_plocks(argv[1], client[ci].fd);
-		client_dead(ci);
+		memset(extra, 0, extra_len);
 
-	} else {
-		rv = -EINVAL;
-		goto reply;
+		rv = do_read(client[ci].fd, extra, extra_len);
+		if (rv < 0) {
+			log_debug("connection %d extra read error %d", ci, rv);
+			goto out;
+		}
 	}
 
-	return rv;
+	ma = (struct gfsc_mount_args *)extra;
 
- reply:
-	sprintf(out, "%d", rv);
-	rv = client_send(ci, out, MAXLINE);
-	return rv;
+	if (!ma) {
+		memset(&empty, 0, sizeof(empty));
+
+		if (h.command == GFSC_CMD_FS_JOIN ||
+		    h.command == GFSC_CMD_FS_REMOUNT) {
+			do_reply(client[ci].fd, h.command, h.name, -EINVAL,
+				 &empty, sizeof(empty));
+		}
+		log_debug("connection %d cmd %d no data", ci, h.command);
+		goto out;
+	}
+
+	switch (h.command) {
+
+	case GFSC_CMD_FS_JOIN:
+		if (group_mode == GROUP_LIBGROUP)
+			join_mountgroup_old(ci, ma);
+		/*
+		else
+			join_mountgroup(ci, ma);
+		*/
+		break;
+
+	case GFSC_CMD_FS_REMOUNT:
+		if (group_mode == GROUP_LIBGROUP)
+			remount_mountgroup_old(ci, ma);
+		/*
+		else
+			remount_mountgroup(ci, ma);
+		*/
+		break;
+
+	case GFSC_CMD_FS_LEAVE:
+		if (group_mode == GROUP_LIBGROUP)
+			leave_mountgroup_old(ma->table, h.data);
+		/*
+		else
+			leave_mountgroup(ma->table, h.data);
+		*/
+		break;
+
+	case GFSC_CMD_FS_MOUNT_DONE:
+		if (group_mode == GROUP_LIBGROUP)
+			mount_done_old(ma, h.data);
+		/*
+		else
+			mount_done(ma, h.data);
+		*/
+		break;
+
+	default:
+		log_error("process_connection %d unknown command %d",
+			  ci, h.command);
+	}
+ out:
+	if (extra)
+		free(extra);
+
+	/* no client_dead(ci) here, since the connection for
+	   join/remount is reused */
 }
 
-static int setup_listen(void)
+static void process_listener(int ci)
+{
+	int fd, i;
+
+	fd = accept(client[ci].fd, NULL, NULL);
+	if (fd < 0) {
+		log_error("process_listener: accept error %d %d", fd, errno);
+		return;
+	}
+
+	i = client_add(fd, process_connection, NULL);
+
+	log_debug("client connection %d fd %d", i, fd);
+}
+
+static int setup_listener(char *sock_path)
 {
 	struct sockaddr_un addr;
 	socklen_t addrlen;
@@ -456,7 +620,7 @@ static int setup_listen(void)
 
 	memset(&addr, 0, sizeof(addr));
 	addr.sun_family = AF_LOCAL;
-	strcpy(&addr.sun_path[1], LOCK_DLM_SOCK_PATH);
+	strcpy(&addr.sun_path[1], sock_path);
 	addrlen = sizeof(sa_family_t) + strlen(addr.sun_path+1) + 1;
 
 	rv = bind(s, (struct sockaddr *) &addr, addrlen);
@@ -472,273 +636,231 @@ static int setup_listen(void)
 		close(s);
 		return rv;
 	}
+	return s;
+}
 
-	log_debug("listen %d", s);
+void query_lock(void)
+{
+	pthread_mutex_lock(&query_mutex);
+}
 
-	return s;
+void query_unlock(void)
+{
+	pthread_mutex_unlock(&query_mutex);
 }
 
-int process_uevent(void)
+/* This is a thread, so we have to be careful, don't call log_ functions.
+   We need a thread to process queries because the main thread may block
+   for long periods. */
+
+static void *process_queries(void *arg)
 {
-	char buf[MAXLINE];
-	char *argv[MAXARGS], *act;
-	int rv, argc = 0;
+	struct gfsc_header h;
+	int s = *((int *)arg);
+	int f, rv;
 
-	memset(buf, 0, sizeof(buf));
-	memset(argv, 0, sizeof(char *) * MAXARGS);
+	for (;;) {
+		f = accept(s, NULL, NULL);
 
-	rv = recv(uevent_fd, &buf, sizeof(buf), 0);
-	if (rv < 0) {
-		log_error("uevent recv error %d errno %d", rv, errno);
-		return -1;
-	}
+		rv = do_read(f, &h, sizeof(h));
+		if (rv < 0) {
+			goto out;
+		}
 
-	if (!strstr(buf, "gfs") || !strstr(buf, "lock_module"))
-		return 0;
+		if (h.magic != GFSC_MAGIC) {
+			goto out;
+		}
 
-	get_args(buf, &argc, argv, '/', 4);
-	if (argc != 4)
-		log_error("uevent message has %d args", argc);
-	act = argv[0];
+		if ((h.version & 0xFFFF0000) != (GFSC_VERSION & 0xFFFF0000)) {
+			goto out;
+		}
 
-	log_debug("kernel: %s %s", act, argv[3]);
+		query_lock();
 
-	if (!strcmp(act, "change@"))
-		kernel_recovery_done(argv[3]);
-	else if (!strcmp(act, "offline@"))
-		do_withdraw(argv[3]);
-	else
-		ping_kernel_mount(argv[3]);
+		switch (h.command) {
+		case GFSC_CMD_DUMP_DEBUG:
+			query_dump_debug(f);
+			break;
+		case GFSC_CMD_DUMP_PLOCKS:
+			query_dump_plocks(f, h.name);
+			break;
+		default:
+			break;
+		}
+		query_unlock();
 
-	return 0;
+ out:
+		close(f);
+	}
 }
 
-int setup_uevent(void)
+static int setup_queries(void)
 {
-	struct sockaddr_nl snl;
-	int s, rv;
+	static int s;	/* static: the query thread reads it after we return */
+	int rv;
 
-	s = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
-	if (s < 0) {
-		log_error("netlink socket error %d errno %d", s, errno);
-		return s;
-	}
+	s = setup_listener(GFSC_QUERY_SOCK_PATH);
+	if (s < 0)
+		return s;
 
-	memset(&snl, 0, sizeof(snl));
-	snl.nl_family = AF_NETLINK;
-	snl.nl_pid = getpid();
-	snl.nl_groups = 1;
+	pthread_mutex_init(&query_mutex, NULL);
 
-	rv = bind(s, (struct sockaddr *) &snl, sizeof(snl));
+	rv = pthread_create(&query_thread, NULL, process_queries, &s);
-	if (rv < 0) {
+	if (rv) {	/* pthread_create returns an errno value, never < 0 */
-		log_error("uevent bind error %d errno %d", rv, errno);
+		log_error("can't create query thread");
 		close(s);
-		return rv;
+		return -rv;	/* negative so the rv < 0 check in loop() sees it */
 	}
+	return 0;
+}
 
-	log_debug("uevent %d", s);
-
-	return s;
+static void cluster_dead(int ci)
+{
+	log_error("cluster is down, exiting");
+	exit(1);
 }
 
-int loop(void)
+static int loop(void)
 {
-	int rv, i, f, error, poll_timeout = -1, ignore_plocks_fd = 0;
+	int poll_timeout = -1;
+	int rv, i;
+	void (*workfn) (int ci);
+	void (*deadfn) (int ci);
 
-	rv = listen_fd = setup_listen();
+	rv = setup_queries();
 	if (rv < 0)
 		goto out;
-	client_add(listen_fd);
 
-	rv = cman_fd = setup_cman();
+	rv = setup_listener(GFSC_SOCK_PATH);
 	if (rv < 0)
 		goto out;
-	client_add(cman_fd);
+	client_add(rv, process_listener, NULL);
 
-	rv = cpg_fd = setup_cpg();
+	rv = setup_uevent();
 	if (rv < 0)
 		goto out;
-	client_add(cpg_fd);
+	client_add(rv, process_uevent, NULL);
 
-	rv = groupd_fd = setup_groupd();
+	rv = setup_cman();
 	if (rv < 0)
 		goto out;
-	client_add(groupd_fd);
+	client_add(rv, process_cman, cluster_dead);
 
-	rv = uevent_fd = setup_uevent();
-	if (rv < 0)
-		goto out;
-	client_add(uevent_fd);
+	group_mode = GROUP_LIBCPG;
 
-	rv = plocks_fd = setup_plocks();
-	if (rv < 0)
-		goto out;
-	plocks_ci = client_add(plocks_fd);
+	if (cfgd_groupd_compat) {
+		rv = setup_groupd();
+		if (rv < 0)
+			goto out;
+		client_add(rv, process_groupd, cluster_dead);
 
-	log_debug("setup done");
+		group_mode = GROUP_LIBGROUP;
 
-	for (;;) {
-		rv = poll(pollfd, client_maxi + 1, poll_timeout);
+		if (cfgd_groupd_compat == 2) {
+			/* set_group_mode(); */
+			/* might set group_mode to GROUP_LIBCPG */
+			group_mode = GROUP_LIBGROUP;
+		}
+	}
+
+	if (group_mode == GROUP_LIBCPG) {
+
+		/*
+		 * code in: cpg_new.c
+		 */
+
+		/*
+		rv = setup_cpg_new();
 		if (rv < 0)
-			log_error("poll error %d errno %d", rv, errno);
+			goto out;
+		client_add(rv, process_cpg_new, cluster_dead);
+		*/
 
-		/* client[0] is listening for new connections */
+	} else if (group_mode == GROUP_LIBGROUP) {
 
-		if (pollfd[0].revents & POLLIN) {
-			f = accept(client[0].fd, NULL, NULL);
-			if (f < 0)
-				log_debug("accept error %d %d", f, errno);
-			else
-				client_add(f);
+		/*
+		 * code in: cpg_old.c group.c recover.c plock.c
+		 */
+
+		rv = setup_cpg_old();
+		if (rv < 0)
+			goto out;
+		client_add(rv, process_cpg_old, cluster_dead);
+
+		rv = setup_plocks();
+		if (rv < 0)
+			goto out;
+		plock_fd = rv;
+		plock_ci = client_add(rv, process_plocks, NULL);
+	}
+
+	for (;;) {
+		rv = poll(pollfd, client_maxi + 1, poll_timeout);
+		if (rv == -1 && errno == EINTR) {
+			if (daemon_quit && list_empty(&mountgroups)) {
+				exit(1);
+			}
+			daemon_quit = 0;
+			continue;
 		}
+		if (rv < 0) {
+			log_error("poll errno %d", errno);
+			goto out;
+		}
+
+		/* FIXME: lock/unlock around operations that take a while */
+		query_lock();
 
-		for (i = 1; i <= client_maxi; i++) {
+		for (i = 0; i <= client_maxi; i++) {
 			if (client[i].fd < 0)
 				continue;
-
 			if (pollfd[i].revents & POLLIN) {
-				if (pollfd[i].fd == groupd_fd)
-					process_groupd();
-				else if (pollfd[i].fd == cman_fd)
-					process_cman();
-				else if (pollfd[i].fd == cpg_fd)
-					process_cpg();
-				else if (pollfd[i].fd == uevent_fd)
-					process_uevent();
-				else if (pollfd[i].fd == plocks_fd) {
-					error = process_plocks();
-					if (error == -EBUSY) {
-						client_ignore(plocks_ci,
-							      plocks_fd);
-						ignore_plocks_fd = 1;
-						poll_timeout = 100;
-					}
-				} else
-					process_client(i);
+				workfn = client[i].workfn;
+				workfn(i);
 			}
-
-			if (pollfd[i].revents & (POLLHUP | POLLERR | POLLNVAL)) {
-				if (pollfd[i].fd == cman_fd) {
-					log_error("cman connection died");
-					exit_cman();
-				} else if (pollfd[i].fd == groupd_fd) {
-					log_error("groupd connection died");
-					exit_cman();
-				} else if (pollfd[i].fd == cpg_fd) {
-					log_error("cpg connection died");
-					exit_cman();
-				}
-				client_dead(i);
+			if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) {
+				deadfn = client[i].deadfn;
+				deadfn(i);
 			}
+		}
 
-			/* check if our plock rate limit has expired so we
-			   can start taking more local plock requests again */
+		poll_timeout = -1;
 
-			if (ignore_plocks_fd) {
-				error = process_plocks();
-				if (error != -EBUSY) {
-					client_back(plocks_ci, plocks_fd);
-					ignore_plocks_fd = 0;
-					poll_timeout = -1;
-				}
+#if 0
+		if (poll_dlm) {
+			/* only happens for GROUP_LIBCPG */
+			process_mountgroup_changes();
+			poll_timeout = 1000;
+		}
+#endif
+
+		if (poll_ignore_plock) {
+			/* only happens for GROUP_LIBGROUP */
+			if (!limit_plocks()) {
+				poll_ignore_plock = 0;
+				client_back(plock_ci, plock_fd);
 			}
+			poll_timeout = 1000;
+		}
 
+		if (dmsetup_wait) {
+			update_dmsetup_wait();
 			if (dmsetup_wait) {
-				update_dmsetup_wait();
-				if (dmsetup_wait) {
-					if (poll_timeout == -1)
-						poll_timeout = 1000;
-				} else {
-					if (poll_timeout == 1000)
-						poll_timeout = -1;
-				}
+				if (poll_timeout == -1)
+					poll_timeout = 1000;
+			} else {
+				if (poll_timeout == 1000)
+					poll_timeout = -1;
 			}
 		}
+
+		query_unlock();
 	}
 	rv = 0;
  out:
 	return rv;
 }
 
-#define PLOCK_RATE_LIMIT_PATH "/cluster/gfs_controld/@plock_rate_limit"
-#define PLOCK_OWNERSHIP_PATH "/cluster/gfs_controld/@plock_ownership"
-#define DROP_RESOURCES_TIME_PATH "/cluster/gfs_controld/@drop_resources_time"
-#define DROP_RESOURCES_COUNT_PATH "/cluster/gfs_controld/@drop_resources_count"
-#define DROP_RESOURCES_AGE_PATH "/cluster/gfs_controld/@drop_resources_age"
-
-static void set_ccs_config(void)
-{
-	char path[PATH_MAX], *str;
-	int i = 0, cd, error;
-
-	while ((cd = ccs_connect()) < 0) {
-		sleep(1);
-		if (++i > 9 && !(i % 10))
-			log_error("connect to ccs error %d, "
-				  "check ccsd or cluster status", cd);
-	}
-
-	memset(path, 0, PATH_MAX);
-	snprintf(path, PATH_MAX, "%s", PLOCK_RATE_LIMIT_PATH);
-	str = NULL;
-
-	error = ccs_get(cd, path, &str);
-	if (!error) {
-		if (!opt_plock_rate_limit)
-			config_plock_rate_limit = atoi(str);
-	}
-	if (str)
-		free(str);
-
-	memset(path, 0, PATH_MAX);
-	snprintf(path, PATH_MAX, "%s", PLOCK_OWNERSHIP_PATH);
-	str = NULL;
-
-	error = ccs_get(cd, path, &str);
-	if (!error) {
-		if (!opt_plock_ownership)
-			config_plock_ownership = atoi(str);
-	}
-	if (str)
-		free(str);
-
-	memset(path, 0, PATH_MAX);
-	snprintf(path, PATH_MAX, "%s", DROP_RESOURCES_TIME_PATH);
-	str = NULL;
-
-	error = ccs_get(cd, path, &str);
-	if (!error) {
-		if (!opt_drop_resources_time)
-			config_drop_resources_time = atoi(str);
-	}
-	if (str)
-		free(str);
-
-	memset(path, 0, PATH_MAX);
-	snprintf(path, PATH_MAX, "%s", DROP_RESOURCES_COUNT_PATH);
-	str = NULL;
-
-	error = ccs_get(cd, path, &str);
-	if (!error) {
-		if (!opt_drop_resources_count)
-			config_drop_resources_count = atoi(str);
-	}
-	if (str)
-		free(str);
-
-	memset(path, 0, PATH_MAX);
-	snprintf(path, PATH_MAX, "%s", DROP_RESOURCES_AGE_PATH);
-	str = NULL;
-
-	error = ccs_get(cd, path, &str);
-	if (!error) {
-		if (!opt_drop_resources_age)
-			config_drop_resources_age = atoi(str);
-	}
-	if (str)
-		free(str);
-}
-
 static void lockfile(void)
 {
 	int fd, error;
@@ -789,25 +911,34 @@ static void print_usage(void)
 	printf("\n");
 	printf("Options:\n");
 	printf("\n");
-	printf("  -D	       Enable debugging code and don't fork\n");
-	printf("  -P	       Enable plock debugging\n");
-	printf("  -w	       Disable withdraw\n");
-	printf("  -p	       Disable plocks\n");
+	printf("  -D           Enable debugging code and don't fork\n");
+	printf("  -g <num>     groupd compatibility, 0 off, 1 on\n");
+	printf("               on: use libgroup, compat with cluster2/stable2/rhel5\n");
+	printf("               off: use libcpg, no backward compatability\n");
+	printf("               Default is %d\n", DEFAULT_GROUPD_COMPAT);
+	printf("  -w <num>     Enable (1) or disable (0) withdraw\n");
+	printf("               Default is %d\n", DEFAULT_ENABLE_WITHDRAW);
+	printf("  -p <num>     Enable (1) or disable (0) plock code\n");
+	printf("               Default is %d\n", DEFAULT_ENABLE_PLOCK);
+	printf("  -P           Enable plock debugging\n");
+
 	printf("  -l <limit>   Limit the rate of plock operations\n");
-	printf("	       Default is %d, set to 0 for no limit\n", DEFAULT_PLOCK_RATE_LIMIT);
-	printf("  -o <n>       plock ownership, 1 enable, 0 disable\n");
+	printf("               Default is %d, set to 0 for no limit\n", DEFAULT_PLOCK_RATE_LIMIT);
+	printf("  -o <n>       Enable (1) or disable (0) plock ownership\n");
 	printf("               Default is %d\n", DEFAULT_PLOCK_OWNERSHIP);
-	printf("  -t <ms>      drop resources time (milliseconds)\n");
+	printf("  -t <ms>      plock ownership drop resources time (milliseconds)\n");
 	printf("               Default is %u\n", DEFAULT_DROP_RESOURCES_TIME);
-	printf("  -c <num>     drop resources count\n");
+	printf("  -c <num>     plock ownership drop resources count\n");
 	printf("               Default is %u\n", DEFAULT_DROP_RESOURCES_COUNT);
-	printf("  -a <ms>      drop resources age (milliseconds)\n");
+	printf("  -a <ms>      plock ownership drop resources age (milliseconds)\n");
 	printf("               Default is %u\n", DEFAULT_DROP_RESOURCES_AGE);
-	printf("  -h	       Print this help, then exit\n");
-	printf("  -V	       Print program version information, then exit\n");
+	printf("  -h           Print this help, then exit\n");
+	printf("  -V           Print program version information, then exit\n");
 }
 
-static void decode_arguments(int argc, char **argv)
+#define OPTION_STRING "DKg:w:f:q:d:p:Pl:o:t:c:a:hV"
+
+static void read_arguments(int argc, char **argv)
 {
 	int cont = 1;
 	int optchar;
@@ -821,43 +952,49 @@ static void decode_arguments(int argc, char **argv)
 			daemon_debug_opt = 1;
 			break;
 
-		case 'P':
-			plock_debug_opt = 1;
+		case 'g':
+			optd_groupd_compat = 1;
+			cfgd_groupd_compat = atoi(optarg);
 			break;
 
 		case 'w':
-			config_no_withdraw = 1;
-			opt_no_withdraw = 1;
+			optd_enable_withdraw = 1;
+			cfgd_enable_withdraw = atoi(optarg);
 			break;
 
 		case 'p':
-			config_no_plock = 1;
-			opt_no_plock = 1;
+			optd_enable_plock = 1;
+			cfgd_enable_plock = atoi(optarg);
+			break;
+
+		case 'P':
+			optd_plock_debug = 1;
+			cfgd_plock_debug = 1;
 			break;
 
 		case 'l':
-			config_plock_rate_limit = atoi(optarg);
-			opt_plock_rate_limit = 1;
+			optd_plock_rate_limit = 1;
+			cfgd_plock_rate_limit = atoi(optarg);
 			break;
 
 		case 'o':
-			config_plock_ownership = atoi(optarg);
-			opt_plock_ownership = 1;
+			optd_plock_ownership = 1;
+			cfgd_plock_ownership = atoi(optarg);
 			break;
 
 		case 't':
-			config_drop_resources_time = atoi(optarg);
-			opt_drop_resources_time = 1;
+			optd_drop_resources_time = 1;
+			cfgd_drop_resources_time = atoi(optarg);
 			break;
 
 		case 'c':
-			config_drop_resources_count = atoi(optarg);
-			opt_drop_resources_count = 1;
+			optd_drop_resources_count = 1;
+			cfgd_drop_resources_count = atoi(optarg);
 			break;
 
 		case 'a':
-			config_drop_resources_age = atoi(optarg);
-			opt_drop_resources_age = 1;
+			optd_drop_resources_age = 1;
+			cfgd_drop_resources_age = atoi(optarg);
 			break;
 
 		case 'h':
@@ -890,7 +1027,7 @@ static void decode_arguments(int argc, char **argv)
 	}
 }
 
-void set_oom_adj(int val)
+static void set_oom_adj(int val)
 {
 	FILE *fp;
 
@@ -902,7 +1039,7 @@ void set_oom_adj(int val)
 	fclose(fp);
 }
 
-void set_scheduler(void)
+static void set_scheduler(void)
 {
 	struct sched_param sched_param;
 	int rv;
@@ -922,18 +1059,9 @@ void set_scheduler(void)
 
 int main(int argc, char **argv)
 {
-	INIT_LIST_HEAD(&mounts);
-	INIT_LIST_HEAD(&withdrawn_mounts);
-
-	config_no_withdraw = DEFAULT_NO_WITHDRAW;
-	config_no_plock = DEFAULT_NO_PLOCK;
-	config_plock_rate_limit = DEFAULT_PLOCK_RATE_LIMIT;
-	config_plock_ownership = DEFAULT_PLOCK_OWNERSHIP;
-	config_drop_resources_time = DEFAULT_DROP_RESOURCES_TIME;
-	config_drop_resources_count = DEFAULT_DROP_RESOURCES_COUNT;
-	config_drop_resources_age = DEFAULT_DROP_RESOURCES_AGE;
+	INIT_LIST_HEAD(&mountgroups);
 
-	decode_arguments(argc, argv);
+	read_arguments(argc, argv);
 
 	lockfile();
 
@@ -944,24 +1072,9 @@ int main(int argc, char **argv)
 		}
 	}
 	openlog("gfs_controld", LOG_PID, LOG_DAEMON);
+	signal(SIGTERM, sigterm_handler);
 
-	/* ccs settings override the defaults, but not the command line */
-	set_ccs_config();
-
-	if (config_plock_ownership)
-		memcpy(protocol_active, protocol_v200, sizeof(protocol_v200));
-	else
-		memcpy(protocol_active, protocol_v100, sizeof(protocol_v100));
-
-	log_debug("config_no_withdraw %d", config_no_withdraw);
-	log_debug("config_no_plock %d", config_no_plock);
-	log_debug("config_plock_rate_limit %u", config_plock_rate_limit);
-	log_debug("config_plock_ownership %u", config_plock_ownership);
-	log_debug("config_drop_resources_time %u", config_drop_resources_time);
-	log_debug("config_drop_resources_count %u", config_drop_resources_count);
-	log_debug("config_drop_resources_age %u", config_drop_resources_age);
-	log_debug("protocol %u.%u.%u", protocol_active[0], protocol_active[1],
-		  protocol_active[2]);
+	read_ccs();
 
 	set_scheduler();
 	set_oom_adj(-16);
@@ -978,17 +1091,27 @@ void daemon_dump_save(void)
 	for (i = 0; i < len; i++) {
 		dump_buf[dump_point++] = daemon_debug_buf[i];
 
-		if (dump_point == DUMP_SIZE) {
+		if (dump_point == GFSC_DUMP_SIZE) {
 			dump_point = 0;
 			dump_wrap = 1;
 		}
 	}
 }
 
-int plock_debug_opt;
 int daemon_debug_opt;
+int daemon_quit;
+int poll_ignore_plock;
+int plock_fd;
+int plock_ci;
+struct list_head mountgroups;
+int cman_quorate;
+int our_nodeid;
+char *clustername;
 char daemon_debug_buf[256];
-char dump_buf[DUMP_SIZE];
+char dump_buf[GFSC_DUMP_SIZE];
 int dump_point;
 int dump_wrap;
+char plock_dump_buf[GFSC_DUMP_SIZE];
+int plock_dump_len;
+int dmsetup_wait;
 
diff --git a/group/gfs_controld/member_cman.c b/group/gfs_controld/member_cman.c
index 4357e35..4e6fbca 100644
--- a/group/gfs_controld/member_cman.c
+++ b/group/gfs_controld/member_cman.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -10,29 +10,16 @@
 *******************************************************************************
 ******************************************************************************/
 
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <arpa/inet.h>
+#include "gfs_daemon.h"
 #include <libcman.h>
 
-#include "lock_dlm.h"
-
-int			our_nodeid;
-char *			clustername;
-cman_cluster_t		cluster;
-static cman_handle_t	ch;
-extern struct list_head mounts;
-
+static cman_handle_t ch;
+static cman_cluster_t cluster;
 
 static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
 {
 	if (reason == CMAN_REASON_TRY_SHUTDOWN) {
-		if (list_empty(&mounts))
+		if (list_empty(&mountgroups))
 			cman_replyto_shutdown(ch, 1);
 		else {
 			log_debug("no to cman shutdown");
@@ -41,13 +28,13 @@ static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
 	}
 }
 
-void exit_cman(void)
+static void exit_cman(void)
 {
 	log_error("cluster is down, exiting");
 	exit(1);
 }
 
-int process_cman(void)
+void process_cman(int ci)
 {
 	int rv;
 
@@ -55,8 +42,6 @@ int process_cman(void)
 
 	if (rv == -1 && errno == EHOSTDOWN)
 		exit_cman();
-
-	return 0;
 }
 
 int setup_cman(void)
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index 1b9bbe2..037c110 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -10,34 +10,13 @@
 *******************************************************************************
 ******************************************************************************/
 
-#include <sys/types.h>
-#include <asm/types.h>
-#include <sys/uio.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/stat.h>
-#include <sys/utsname.h>
-#include <sys/time.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <net/if.h>
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <fcntl.h>
-#include <netdb.h>
-#include <limits.h>
-#include <unistd.h>
-#include <dirent.h>
-#include <openais/saAis.h>
-#include <openais/saCkpt.h>
-#include <linux/dlm_plock.h>
+/* gfs_controld only handles plocks in rhel5/stable2 compat mode */
+
+#include "gfs_daemon.h"
+#include "cpg-old.h"
+#include "config.h"
 
-#include "lock_dlm.h"
+#include <linux/dlm_plock.h>
 
 #define PROC_MISC               "/proc/misc"
 #define PROC_DEVICES            "/proc/devices"
@@ -45,19 +24,6 @@
 #define CONTROL_DIR             "/dev/misc"
 #define CONTROL_NAME            "dlm_plock"
 
-extern struct list_head mounts;
-extern int our_nodeid;
-extern int message_flow_control_on;
-
-/* user configurable */
-extern int config_no_plock;
-extern uint32_t config_plock_rate_limit;
-extern uint32_t config_plock_ownership;
-extern uint32_t config_drop_resources_time;
-extern uint32_t config_drop_resources_count;
-extern uint32_t config_drop_resources_age;
-
-static int plocks_online = 0;
 static uint32_t plock_read_count;
 static uint32_t plock_recv_count;
 static uint32_t plock_rate_delays;
@@ -73,6 +39,8 @@ static char section_buf[1024 * 1024];
 static uint32_t section_len;
 static int need_fsid_translation = 0;
 
+extern int message_flow_control_on;
+
 struct pack_plock {
 	uint64_t start;
 	uint64_t end;
@@ -332,16 +300,15 @@ int setup_plocks(void)
 	gettimeofday(&plock_recv_time, NULL);
 	gettimeofday(&plock_rate_last, NULL);
 
-	if (config_no_plock)
-		goto control;
-
 	err = saCkptInitialize(&ckpt_handle, &callbacks, &version);
-	if (err == SA_AIS_OK)
-		plocks_online = 1;
-	else
-		log_error("ckpt init error %d - plocks unavailable", err);
+	if (err != SA_AIS_OK) {
+		log_error("ckpt init error %d", err);
+		cfgd_enable_plock = 0;
+
+		/* still try to open and read the control device so that we can
+		   send ENOSYS back to the kernel if it tries to do a plock */
+	}
 
- control:
 	need_fsid_translation = 1;
 
 	rv = open_control(CONTROL_NAME, DLM_PLOCK_MISC_NAME);
@@ -364,7 +331,7 @@ int setup_plocks(void)
 	log_debug("plock need_fsid_translation %d", need_fsid_translation);
 	log_debug("plock cpg message size: %u bytes",
 		  (unsigned int) (sizeof(struct gdlm_header) +
-		                  sizeof(struct dlm_plock_info)));
+				  sizeof(struct dlm_plock_info)));
 
 	return control_fd;
 }
@@ -392,7 +359,7 @@ static struct resource *search_resource(struct mountgroup *mg, uint64_t number)
 {
 	struct resource *r;
 
-	list_for_each_entry(r, &mg->resources, list) {
+	list_for_each_entry(r, &mg->plock_resources, list) {
 		if (r->number == number)
 			return r;
 	}
@@ -427,12 +394,12 @@ static int find_resource(struct mountgroup *mg, uint64_t number, int create,
 	INIT_LIST_HEAD(&r->waiters);
 	INIT_LIST_HEAD(&r->pending);
 
-	if (config_plock_ownership)
+	if (cfgd_plock_ownership)
 		r->owner = -1;
 	else
 		r->owner = 0;
 
-	list_add_tail(&r->list, &mg->resources);
+	list_add_tail(&r->list, &mg->plock_resources);
  out:
 	if (r)
 		gettimeofday(&r->last_access, NULL);
@@ -443,7 +410,7 @@ static int find_resource(struct mountgroup *mg, uint64_t number, int create,
 static void put_resource(struct resource *r)
 {
 	/* with ownership, resources are only freed via drop messages */
-	if (config_plock_ownership)
+	if (cfgd_plock_ownership)
 		return;
 
 	if (list_empty(&r->locks) && list_empty(&r->waiters)) {
@@ -456,8 +423,8 @@ static inline int ranges_overlap(uint64_t start1, uint64_t end1,
 				 uint64_t start2, uint64_t end2)
 {
 	if (end1 < start2 || start1 > end2)
-		return FALSE;
-	return TRUE;
+		return 0;
+	return 1;
 }
 
 /**
@@ -630,7 +597,7 @@ static int lock_case1(struct posix_lock *po, struct resource *r,
    1. add new lock for front fragment, orig mode
    2. add new lock for back fragment, orig mode
    3. convert RE to RN range and mode */
-			 
+
 static int lock_case2(struct posix_lock *po, struct resource *r,
 		      struct dlm_plock_info *in)
 
@@ -958,11 +925,11 @@ static void _receive_plock(struct mountgroup *mg, char *buf, int len, int from)
 		return;
 	}
 
-	create = !config_plock_ownership;
+	create = !cfgd_plock_ownership;
 
 	rv = find_resource(mg, info.number, create, &r);
 
-	if (rv && config_plock_ownership) {
+	if (rv && cfgd_plock_ownership) {
 		/* There must have been a race with a drop, so we need to
 		   ignore this plock op which will be resent.  If we're the one
 		   who sent the plock, we need to send_own() and put it on the
@@ -1043,7 +1010,7 @@ static void _receive_plock(struct mountgroup *mg, char *buf, int len, int from)
 void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
 {
 	if (mg->save_plocks) {
-		save_message(mg, buf, len, from, MSG_PLOCK);
+		save_message_old(mg, buf, len, from, MSG_PLOCK);
 		return;
 	}
 
@@ -1079,7 +1046,7 @@ static int send_struct_info(struct mountgroup *mg, struct dlm_plock_info *in,
 
 	memcpy(buf + sizeof(struct gdlm_header), in, sizeof(*in));
 
-	rv = send_group_message(mg, len, buf);
+	rv = send_group_message_old(mg, len, buf);
 
 	free(buf);
  out:
@@ -1329,7 +1296,7 @@ static void _receive_own(struct mountgroup *mg, char *buf, int len, int from)
 void receive_own(struct mountgroup *mg, char *buf, int len, int from)
 {
 	if (mg->save_plocks) {
-		save_message(mg, buf, len, from, MSG_PLOCK_OWN);
+		save_message_old(mg, buf, len, from, MSG_PLOCK_OWN);
 		return;
 	}
 
@@ -1368,7 +1335,7 @@ static void clear_syncing_flag(struct resource *r, struct dlm_plock_info *in)
 	}
 
 	log_error("clear_syncing %llx no match %s %llx-%llx %d/%u/%llx",
-		  (unsigned long long)r->number, in->ex ? "WR" : "RD", 
+		  (unsigned long long)r->number, in->ex ? "WR" : "RD",
 		  (unsigned long long)in->start, (unsigned long long)in->end,
 		  in->nodeid, in->pid, (unsigned long long)in->owner);
 }
@@ -1401,7 +1368,7 @@ static void _receive_sync(struct mountgroup *mg, char *buf, int len, int from)
 	}
 
 	if (hd->type == MSG_PLOCK_SYNC_LOCK)
-		add_lock(r, info.nodeid, info.owner, info.pid, !info.ex, 
+		add_lock(r, info.nodeid, info.owner, info.pid, !info.ex,
 			 info.start, info.end);
 	else if (hd->type == MSG_PLOCK_SYNC_WAITER)
 		add_waiter(mg, r, &info);
@@ -1412,7 +1379,7 @@ void receive_sync(struct mountgroup *mg, char *buf, int len, int from)
 	struct gdlm_header *hd = (struct gdlm_header *) buf;
 
 	if (mg->save_plocks) {
-		save_message(mg, buf, len, from, hd->type);
+		save_message_old(mg, buf, len, from, hd->type);
 		return;
 	}
 
@@ -1473,7 +1440,7 @@ static void _receive_drop(struct mountgroup *mg, char *buf, int len, int from)
 void receive_drop(struct mountgroup *mg, char *buf, int len, int from)
 {
 	if (mg->save_plocks) {
-		save_message(mg, buf, len, from, MSG_PLOCK_DROP);
+		save_message_old(mg, buf, len, from, MSG_PLOCK_DROP);
 		return;
 	}
 
@@ -1496,13 +1463,13 @@ static int drop_resources(struct mountgroup *mg)
 
 	/* try to drop the oldest, unused resources */
 
-	list_for_each_entry_reverse(r, &mg->resources, list) {
-		if (count >= config_drop_resources_count)
+	list_for_each_entry_reverse(r, &mg->plock_resources, list) {
+		if (count >= cfgd_drop_resources_count)
 			break;
 		if (r->owner && r->owner != our_nodeid)
 			continue;
 		if (time_diff_ms(&r->last_access, &now) <
-		    config_drop_resources_age)
+		    cfgd_drop_resources_age)
 			continue;
 
 		if (list_empty(&r->locks) && list_empty(&r->waiters)) {
@@ -1617,7 +1584,7 @@ static uint32_t ls_to_mg_id(uint32_t fsid)
 	int do_set = 1;
 
  retry:
-	list_for_each_entry(mg, &mounts, list) {
+	list_for_each_entry(mg, &mountgroups, list) {
 		if (mg->associated_ls_id == fsid)
 			return mg->id;
 	}
@@ -1631,36 +1598,55 @@ static uint32_t ls_to_mg_id(uint32_t fsid)
 	return fsid;
 }
 
-int process_plocks(void)
+int limit_plocks(void)
 {
-	struct mountgroup *mg;
-	struct resource *r;
-	struct dlm_plock_info info;
 	struct timeval now;
-	uint64_t usec;
-	int rv;
 
 	/* Don't send more messages while the cpg message queue is backed up */
 
 	if (message_flow_control_on) {
 		update_flow_control_status();
 		if (message_flow_control_on)
-			return -EBUSY;
+			return 1;
 	}
 
+	if (!cfgd_plock_rate_limit || !plock_read_count)
+		return 0;
+
 	gettimeofday(&now, NULL);
 
-	/* Every N ops we check how long it's taken to do those N ops.
-	   If it's less than 1000 ms, we don't take any more. */
+	/* Every time a plock op is read from the kernel, we increment
+	   plock_read_count.  After every cfgd_plock_rate_limit (N) reads,
+	   we check the time it's taken to do those N; if the time is less than
+	   a second, then we delay reading any more until a second is up.
+	   This way we read a max of N ops from the kernel every second. */
 
-	if (config_plock_rate_limit && plock_read_count &&
-	    !(plock_read_count % config_plock_rate_limit)) {
+	if (!(plock_read_count % cfgd_plock_rate_limit)) {
 		if (time_diff_ms(&plock_rate_last, &now) < 1000) {
 			plock_rate_delays++;
-			return -EBUSY;
+			return 2;
 		}
 		plock_rate_last = now;
 	}
+	return 0;
+}
+
+void process_plocks(int ci)
+{
+	struct mountgroup *mg;
+	struct resource *r;
+	struct dlm_plock_info info;
+	struct timeval now;
+	uint64_t usec;
+	int rv;
+
+	if (limit_plocks()) {
+		poll_ignore_plock = 1;
+		client_ignore(plock_ci, plock_fd);
+		return;
+	}
+
+	gettimeofday(&now, NULL);
 
 	memset(&info, 0, sizeof(info));
 
@@ -1668,13 +1654,13 @@ int process_plocks(void)
 	if (rv < 0) {
 		log_debug("process_plocks: read error %d fd %d\n",
 			  errno, control_fd);
-		return 0;
+		return;
 	}
 
 	/* kernel doesn't set the nodeid field */
 	info.nodeid = our_nodeid;
 
-	if (!plocks_online) {
+	if (!cfgd_enable_plock) {
 		rv = -ENOSYS;
 		goto fail;
 	}
@@ -1727,20 +1713,20 @@ int process_plocks(void)
 		save_pending_plock(mg, r, &info);
 	}
 
-	if (config_plock_ownership &&
+	if (cfgd_plock_ownership &&
 	    time_diff_ms(&mg->drop_resources_last, &now) >=
-	    		 config_drop_resources_time) {
+	    		 cfgd_drop_resources_time) {
 		mg->drop_resources_last = now;
 		drop_resources(mg);
 	}
 
-	return 0;
+	return;
 
  fail:
 	info.rv = rv;
 	rv = write(control_fd, &info, sizeof(info));
 
-	return 0;
+	return;
 }
 
 void process_saved_plocks(struct mountgroup *mg)
@@ -1778,7 +1764,7 @@ void process_saved_plocks(struct mountgroup *mg)
 
 void plock_exit(void)
 {
-	if (plocks_online)
+	if (cfgd_enable_plock)
 		saCkptFinalize(ckpt_handle);
 }
 
@@ -1850,7 +1836,7 @@ static int unpack_section_buf(struct mountgroup *mg, char *numbuf, int buflen)
 	INIT_LIST_HEAD(&r->waiters);
 	INIT_LIST_HEAD(&r->pending);
 
-	if (config_plock_ownership)
+	if (cfgd_plock_ownership)
 		sscanf(numbuf, "r%llu.%d", &num, &owner);
 	else
 		sscanf(numbuf, "r%llu", &num);
@@ -1884,11 +1870,11 @@ static int unpack_section_buf(struct mountgroup *mg, char *numbuf, int buflen)
 		pp++;
 	}
 
-	list_add_tail(&r->list, &mg->resources);
+	list_add_tail(&r->list, &mg->plock_resources);
 	return 0;
 }
 
-int _unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
+static int _unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
 {
 	SaCkptCheckpointHandleT h;
 	SaCkptCheckpointDescriptorT s;
@@ -2003,7 +1989,7 @@ void store_plocks(struct mountgroup *mg, int nodeid)
 	int r_count, lock_count, total_size, section_size, max_section_size;
 	int len, owner;
 
-	if (!plocks_online)
+	if (!cfgd_enable_plock)
 		return;
 
 	/* no change to plock state since we created the last checkpoint */
@@ -2031,7 +2017,7 @@ void store_plocks(struct mountgroup *mg, int nodeid)
 	total_size = 0;
 	max_section_size = 0;
 
-	list_for_each_entry(r, &mg->resources, list) {
+	list_for_each_entry(r, &mg->plock_resources, list) {
 		if (r->owner == -1)
 			continue;
 
@@ -2097,7 +2083,7 @@ void store_plocks(struct mountgroup *mg, int nodeid)
 	   - If r owner is 0 and got_unown, then ckpt owner 0 and all plocks;
 	     (there should be no SYNCING plocks) */
 
-	list_for_each_entry(r, &mg->resources, list) {
+	list_for_each_entry(r, &mg->plock_resources, list) {
 		if (r->owner == -1)
 			continue;
 		else if (r->owner == our_nodeid)
@@ -2115,7 +2101,7 @@ void store_plocks(struct mountgroup *mg, int nodeid)
 		}
 
 		memset(&buf, 0, sizeof(buf));
-		if (config_plock_ownership)
+		if (cfgd_plock_ownership)
 			len = snprintf(buf, SECTION_NAME_LEN, "r%llu.%d",
 			       	       (unsigned long long)r->number, owner);
 		else
@@ -2183,7 +2169,7 @@ void retrieve_plocks(struct mountgroup *mg)
 	char buf[SECTION_NAME_LEN];
 	int len;
 
-	if (!plocks_online)
+	if (!cfgd_enable_plock)
 		return;
 
 	log_group(mg, "retrieve_plocks");
@@ -2302,7 +2288,10 @@ void purge_plocks(struct mountgroup *mg, int nodeid, int unmount)
 	struct resource *r, *r2;
 	int purged = 0;
 
-	list_for_each_entry_safe(r, r2, &mg->resources, list) {
+	if (!cfgd_enable_plock)
+		return;
+
+	list_for_each_entry_safe(r, r2, &mg->plock_resources, list) {
 		list_for_each_entry_safe(po, po2, &r->locks, list) {
 			if (po->nodeid == nodeid || unmount) {
 				list_del(&po->list);
@@ -2327,17 +2316,17 @@ void purge_plocks(struct mountgroup *mg, int nodeid, int unmount)
 			r->owner = 0;
 			send_pending_plocks(mg, r);
 		}
-		
+
 		if (!list_empty(&r->waiters))
 			do_waiters(mg, r);
 
-		if (!config_plock_ownership &&
+		if (!cfgd_plock_ownership &&
 		    list_empty(&r->locks) && list_empty(&r->waiters)) {
 			list_del(&r->list);
 			free(r);
 		}
 	}
-	
+
 	if (purged)
 		mg->last_plock_time = time(NULL);
 
@@ -2351,25 +2340,20 @@ void purge_plocks(struct mountgroup *mg, int nodeid, int unmount)
 		unlink_checkpoint(mg);
 }
 
-int dump_plocks(char *name, int fd)
+int fill_plock_dump_buf(struct mountgroup *mg)
 {
-	struct mountgroup *mg;
 	struct posix_lock *po;
 	struct lock_waiter *w;
 	struct resource *r;
-	char line[MAXLINE];
-	int rv;
-
-	if (!name)
-		return -1;
+	int rv = 0;
+	int len = GFSC_DUMP_SIZE, pos = 0, ret;
 
-	mg = find_mg(name);
-	if (!mg)
-		return -1;
+	memset(plock_dump_buf, 0, sizeof(plock_dump_buf));
+	plock_dump_len = 0;
 
-	list_for_each_entry(r, &mg->resources, list) {
+	list_for_each_entry(r, &mg->plock_resources, list) {
 		list_for_each_entry(po, &r->locks, list) {
-			snprintf(line, MAXLINE,
+			ret = snprintf(plock_dump_buf + pos, len - pos,
 			      "%llu %s %llu-%llu nodeid %d pid %u owner %llx\n",
 			      (unsigned long long)r->number,
 			      po->ex ? "WR" : "RD",
@@ -2378,11 +2362,15 @@ int dump_plocks(char *name, int fd)
 			      po->nodeid, po->pid,
 			      (unsigned long long)po->owner);
 
-			rv = do_write(fd, line, strlen(line));
+			if (ret >= len - pos) {
+				rv = -ENOSPC;
+				goto out;
+			}
+			pos += ret;
 		}
 
 		list_for_each_entry(w, &r->waiters, list) {
-			snprintf(line, MAXLINE,
+			ret = snprintf(plock_dump_buf + pos, len - pos,
 			      "%llu WAITING %s %llu-%llu nodeid %d pid %u owner %llx\n",
 			      (unsigned long long)r->number,
 			      w->info.ex ? "WR" : "RD",
@@ -2391,10 +2379,14 @@ int dump_plocks(char *name, int fd)
 			      w->info.nodeid, w->info.pid,
 			      (unsigned long long)w->info.owner);
 
-			rv = do_write(fd, line, strlen(line));
+			if (ret >= len - pos) {
+				rv = -ENOSPC;
+				goto out;
+			}
+			pos += ret;
 		}
 	}
-
-	return 0;
+ out:
+	return rv;
 }
 
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
deleted file mode 100644
index 52d96ff..0000000
--- a/group/gfs_controld/recover.c
+++ /dev/null
@@ -1,2805 +0,0 @@
-/******************************************************************************
-*******************************************************************************
-**
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
-**
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
-**
-*******************************************************************************
-******************************************************************************/
-
-#include "lock_dlm.h"
-#include "ccs.h"
-
-#define SYSFS_DIR	"/sys/fs"
-#define JID_INIT	-9
-
-extern char *clustername;
-extern int our_nodeid;
-extern group_handle_t gh;
-extern int config_no_withdraw;
-extern int dmsetup_wait;
-
-struct list_head mounts;
-struct list_head withdrawn_mounts;
-
-void send_journals(struct mountgroup *mg, int nodeid);
-void start_spectator_init_2(struct mountgroup *mg);
-void start_spectator_2(struct mountgroup *mg);
-void notify_mount_client(struct mountgroup *mg);
-
-
-int set_sysfs(struct mountgroup *mg, char *field, int val)
-{
-	char fname[512];
-	char out[16];
-	int rv, fd;
-
-	snprintf(fname, 512, "%s/%s/%s/lock_module/%s",
-		 SYSFS_DIR, mg->type, mg->table, field);
-
-	log_group(mg, "set %s to %d", fname, val);
-
-	fd = open(fname, O_RDWR);
-	if (fd < 0) {
-		log_group(mg, "set open %s error %d %d", fname, fd, errno);
-		return -1;
-	}
-
-	mg->got_kernel_mount = 1;
-
-	memset(out, 0, 16);
-	sprintf(out, "%d", val);
-	rv = write(fd, out, strlen(out));
-
-	if (rv != strlen(out)) {
-		log_error("write %s error %d %d", fname, fd, errno);
-		close(fd);
-		return -1;
-	}
-
-	close(fd);
-	return 0;
-}
-
-int get_sysfs(struct mountgroup *mg, char *field, char *buf, int len)
-{
-	char fname[512], *p;
-	int fd, rv;
-
-	snprintf(fname, 512, "%s/%s/%s/lock_module/%s",
-		 SYSFS_DIR, mg->type, mg->table, field);
-
-	fd = open(fname, O_RDONLY);
-	if (fd < 0) {
-		log_group(mg, "get open %s error %d %d", fname, fd, errno);
-		return -1;
-	}
-
-	mg->got_kernel_mount = 1;
-
-	rv = read(fd, buf, len);
-	if (rv < 0)
-		log_error("read %s error %d %d", fname, rv, errno);
-	else {
-		rv = 0;
-		p = strchr(buf, '\n');
-		if (p)
-			*p = '\0';
-	}
-
-	close(fd);
-	return rv;
-}
-
-struct mg_member *find_memb_nodeid(struct mountgroup *mg, int nodeid)
-{
-	struct mg_member *memb;
-
-	list_for_each_entry(memb, &mg->members, list) {
-		if (memb->nodeid == nodeid)
-			return memb;
-	}
-	return NULL;
-}
-
-struct mg_member *find_memb_jid(struct mountgroup *mg, int jid)
-{
-	struct mg_member *memb;
-
-	list_for_each_entry(memb, &mg->members, list) {
-		if (memb->jid == jid)
-			return memb;
-	}
-	return NULL;
-}
-
-int first_mounter_recovery(struct mountgroup *mg)
-{
-	struct mg_member *memb;
-
-	list_for_each_entry(memb, &mg->members, list) {
-		if (memb->opts & MEMB_OPT_RECOVER)
-			return memb->nodeid;
-	}
-	return 0;
-}
-
-int local_first_mounter_recovery(struct mountgroup *mg)
-{
-	int nodeid;
-
-	nodeid = first_mounter_recovery(mg);
-	if (nodeid == our_nodeid)
-		return 1;
-	return 0;
-}
-
-int remote_first_mounter_recovery(struct mountgroup *mg)
-{
-	int nodeid;
-
-	nodeid = first_mounter_recovery(mg);
-	if (nodeid && (nodeid != our_nodeid))
-		return 1;
-	return 0;
-}
-
-static void start_done(struct mountgroup *mg)
-{
-	log_group(mg, "start_done %d", mg->start_event_nr);
-	group_start_done(gh, mg->name, mg->start_event_nr);
-}
-
-void notify_remount_client(struct mountgroup *mg, char *msg)
-{
-	char buf[MAXLINE];
-	int rv;
-
-	memset(buf, 0, MAXLINE);
-	snprintf(buf, MAXLINE, "%s", msg);
-
-	log_debug("notify_remount_client: %s", buf);
-
-	rv = client_send(mg->remount_client, buf, MAXLINE);
-	if (rv < 0)
-		log_error("notify_remount_client: send failed %d", rv);
-
-	mg->remount_client = 0;
-}
-
-void send_withdraw(struct mountgroup *mg)
-{
-	struct gdlm_header *hd;
-	int len;
-	char *buf;
-
-	len = sizeof(struct gdlm_header);
-
-	buf = malloc(len);
-	if (!buf)
-		return;
-	memset(buf, 0, len);
-
-	hd = (struct gdlm_header *)buf;
-	hd->type = MSG_WITHDRAW;
-	hd->nodeid = our_nodeid;
-	hd->to_nodeid = 0;
-
-	log_group(mg, "send_withdraw");
-
-	send_group_message(mg, len, buf);
-
-	free(buf);
-}
-
-void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from)
-{
-	struct mg_member *memb;
-
-	memb = find_memb_nodeid(mg, from);
-	if (!memb) {
-		log_group(mg, "receive_withdraw no member %d", from);
-		return;
-	}
-	log_group(mg, "receive_withdraw from %d", from);
-	memb->withdrawing = 1;
-
-	if (from == our_nodeid)
-		group_leave(gh, mg->name);
-}
-
-#define SEND_RS_INTS 3
-
-void send_recovery_status(struct mountgroup *mg)
-{
-	struct gdlm_header *hd;
-	struct mg_member *memb;
-	int len, *p, i, n = 0;
-	char *buf;
-
-	list_for_each_entry(memb, &mg->members_gone, list) {
-		if (memb->local_recovery_status == RS_SUCCESS)
-			n++;
-	}
-
-	len = sizeof(struct gdlm_header) + (n * SEND_RS_INTS * sizeof(int));
-
-	buf = malloc(len);
-	if (!buf)
-		return;
-	memset(buf, 0, len);
-
-	hd = (struct gdlm_header *)buf;
-	hd->type = MSG_RECOVERY_STATUS;
-	hd->nodeid = our_nodeid;
-	hd->to_nodeid = 0;
-	p = (int *) (buf + sizeof(struct gdlm_header));
-
-	i = 0;
-	list_for_each_entry(memb, &mg->members_gone, list) {
-		if (memb->local_recovery_status != RS_SUCCESS)
-			continue;
-		p[i] = cpu_to_le32(memb->nodeid);
-		i++;
-		p[i] = cpu_to_le32(memb->jid);
-		i++;
-		p[i] = cpu_to_le32(memb->local_recovery_status);
-		i++;
-	}
-
-	log_group(mg, "send_recovery_status for %d nodes len %d", n, len);
-
-	send_group_message(mg, len, buf);
-
-	free(buf);
-}
-
-/* Note: we can get more than one node reporting success in recovering
-   the journal for a failed node.  The first has really recovered it,
-   the rest have found the fs clean and report success. */
-
-void _receive_recovery_status(struct mountgroup *mg, char *buf, int len,
-			      int from)
-{
-	struct mg_member *memb;
-	int *p, n, i, nodeid, jid, status, found = 0;
-
-	n = (len - sizeof(struct gdlm_header)) / (SEND_RS_INTS * sizeof(int));
-
-	p = (int *) (buf + sizeof(struct gdlm_header));
-
-	for (i = 0; i < n; i++) {
-		nodeid = le32_to_cpu(p[i * SEND_RS_INTS]);
-		jid    = le32_to_cpu(p[i * SEND_RS_INTS + 1]);
-		status = le32_to_cpu(p[i * SEND_RS_INTS + 2]);
-
-		ASSERT(status == RS_SUCCESS);
-
-		found = 0;
-		list_for_each_entry(memb, &mg->members_gone, list) {
-			if (memb->nodeid != nodeid)
-				continue;
-			ASSERT(memb->jid == jid);
-			ASSERT(memb->recovery_status == RS_NEED_RECOVERY ||
-			       memb->recovery_status == RS_SUCCESS);
-			memb->recovery_status = status;
-			found = 1;
-			break;
-		}
-
-		log_group(mg, "receive_recovery_status from %d len %d "
-			  "nodeid %d jid %d status %d found %d",
-			  from, len, nodeid, jid, status, found);
-	}
-
-	if (from == our_nodeid)
-		start_done(mg);
-}
-
-void process_saved_recovery_status(struct mountgroup *mg)
-{
-	struct save_msg *sm, *sm2;
-
-	if (list_empty(&mg->saved_messages))
-		return;
-
-	log_group(mg, "process_saved_recovery_status");
-
-	list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
-		if (sm->type != MSG_RECOVERY_STATUS)
-			continue;
-		_receive_recovery_status(mg, sm->buf, sm->len, sm->nodeid);
-		list_del(&sm->list);
-		free(sm);
-	}
-}
-
-void assign_next_first_mounter(struct mountgroup *mg)
-{
-	struct mg_member *memb, *next = NULL;
-	int low = -1;
-
-	list_for_each_entry(memb, &mg->members, list) {
-		if (memb->jid == -2)
-			continue;
-		if (memb->jid == -9)
-			continue;
-		if (memb->spectator || memb->readonly || memb->withdrawing ||
-		    memb->ms_kernel_mount_done)
-			continue;
-		if (low == -1 || memb->nodeid < low) {
-			next = memb;
-			low = memb->nodeid;
-		}
-	}
-
-	if (next) {
-		log_group(mg, "next first mounter is %d jid %d opts %x",
-			  next->nodeid, next->jid, next->opts);
-		next->opts |= MEMB_OPT_RECOVER;
-		ASSERT(next->jid >= 0);
-	} else
-		log_group(mg, "no next mounter available yet");
-}
-
-#define SEND_MS_INTS 4
-
-void send_mount_status(struct mountgroup *mg)
-{
-	struct gdlm_header *hd;
-	int len, *p;
-	char *buf;
-
-	len = sizeof(struct gdlm_header) + (SEND_MS_INTS * sizeof(int));
-
-	buf = malloc(len);
-	if (!buf)
-		return;
-	memset(buf, 0, len);
-
-	hd = (struct gdlm_header *)buf;
-	hd->type = MSG_MOUNT_STATUS;
-	hd->nodeid = our_nodeid;
-	hd->to_nodeid = 0;
-
-	p = (int *) (buf + sizeof(struct gdlm_header));
-
-	p[0] = cpu_to_le32(mg->first_mounter);
-	p[1] = cpu_to_le32(mg->kernel_mount_error);
-	p[2] = 0; /* unused */
-	p[3] = 0; /* unused */
-
-	log_group(mg, "send_mount_status kernel_mount_error %d "
-		      "first_mounter %d",
-		      mg->kernel_mount_error,
-		      mg->first_mounter);
-
-	send_group_message(mg, len, buf);
-
-	free(buf);
-}
-
-void _receive_mount_status(struct mountgroup *mg, char *buf, int len, int from)
-{
-	struct mg_member *memb, *us;
-	int *p;
-
-	p = (int *) (buf + sizeof(struct gdlm_header));
-
-	memb = find_memb_nodeid(mg, from);
-	if (!memb) {
-		log_group(mg, "_receive_mount_status no node %d", from);
-		return;
-	}
-
-	memb->ms_kernel_mount_done = 1;
-	memb->ms_first_mounter = le32_to_cpu(p[0]);
-	memb->ms_kernel_mount_error = le32_to_cpu(p[1]);
-
-	log_group(mg, "_receive_mount_status from %d kernel_mount_error %d "
-		      "first_mounter %d opts %x", from,
-		      memb->ms_kernel_mount_error, memb->ms_first_mounter,
-		      memb->opts);
-
-	if (memb->opts & MEMB_OPT_RECOVER) {
-		ASSERT(memb->ms_first_mounter);
-	}
-	if (memb->ms_first_mounter) {
-		ASSERT(memb->opts & MEMB_OPT_RECOVER);
-	}
-
-	if (memb->ms_first_mounter) {
-		memb->opts &= ~MEMB_OPT_RECOVER;
-
-		if (!memb->ms_kernel_mount_error) {
-			/* the first mounter has successfully mounted, we can
-			   go ahead and mount now */
-
-			if (mg->mount_client_delay) {
-				mg->mount_client_delay = 0;
-				notify_mount_client(mg);
-			}
-		} else {
-			/* first mounter mount failed, next low node should be
-			   made first mounter */
-
-			memb->jid = -2;
-			if (from == our_nodeid)
-				mg->our_jid = -2;
-
-			assign_next_first_mounter(mg);
-
-			/* if we became the next first mounter, then notify
-			   mount client */
-
-			us = find_memb_nodeid(mg, our_nodeid);
-			if (us->opts & MEMB_OPT_RECOVER) {
-				log_group(mg, "we are next first mounter");
-				mg->first_mounter = 1;
-				mg->first_mounter_done = 0;
-				mg->mount_client_delay = 0;
-				notify_mount_client(mg);
-			}
-		}
-	}
-}
-
-void receive_mount_status(struct mountgroup *mg, char *buf, int len, int from)
-{
-	log_group(mg, "receive_mount_status from %d len %d last_cb %d",
-		  from, len, mg->last_callback);
-
-	if (!mg->got_our_options) {
-		log_group(mg, "ignore mount_status from %d", from);
-		return;
-	}
-
-	if (!mg->got_our_journals)
-		save_message(mg, buf, len, from, MSG_MOUNT_STATUS);
-	else
-		_receive_mount_status(mg, buf, len, from);
-}
-
-/* We delay processing mount_status msesages until we receive the journals
-   message for our own mount.  Our journals message is a snapshot of the memb
-   list at the time our options message is received on the remote node.  We
-   ignore any messages that would change the memb list prior to seeing our own
-   options message and we save any messages that would change the memb list
-   after seeing our own options message and before we receive the memb list
-   from the journals message. */
-
-void process_saved_mount_status(struct mountgroup *mg)
-{
-	struct save_msg *sm, *sm2;
-
-	if (list_empty(&mg->saved_messages))
-		return;
-
-	log_group(mg, "process_saved_mount_status");
-
-	list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
-		if (sm->type != MSG_MOUNT_STATUS)
-			continue;
-		_receive_mount_status(mg, sm->buf, sm->len, sm->nodeid);
-		list_del(&sm->list);
-		free(sm);
-	}
-}
-
-char *msg_name(int type)
-{
-	switch (type) {
-	case MSG_JOURNAL:
-		return "MSG_JOURNAL";
-	case MSG_OPTIONS:
-		return "MSG_OPTIONS";
-	case MSG_REMOUNT:
-		return "MSG_REMOUNT";
-	case MSG_PLOCK:
-		return "MSG_PLOCK";
-	case MSG_MOUNT_STATUS:
-		return "MSG_MOUNT_STATUS";
-	case MSG_RECOVERY_STATUS:
-		return "MSG_RECOVERY_STATUS";
-	case MSG_RECOVERY_DONE:
-		return "MSG_RECOVERY_DONE";
-	case MSG_WITHDRAW:
-		return "MSG_WITHDRAW";
-	}
-	return "unknown";
-}
-
-/* we can receive recovery_status messages from other nodes doing start before
-   we actually process the corresponding start callback ourselves */
-
-void save_message(struct mountgroup *mg, char *buf, int len, int from, int type)
-{
-	struct save_msg *sm;
-
-	sm = malloc(sizeof(struct save_msg) + len);
-	if (!sm)
-		return;
-	memset(sm, 0, sizeof(struct save_msg) + len);
-
-	memcpy(&sm->buf, buf, len);
-	sm->type = type;
-	sm->len = len;
-	sm->nodeid = from;
-
-	log_group(mg, "save %s from %d len %d", msg_name(type), from, len);
-
-	list_add_tail(&sm->list, &mg->saved_messages);
-}
-
-void receive_recovery_status(struct mountgroup *mg, char *buf, int len,
-			     int from)
-{
-	switch (mg->last_callback) {
-	case DO_STOP:
-		save_message(mg, buf, len, from, MSG_RECOVERY_STATUS);
-		break;
-	case DO_START:
-		_receive_recovery_status(mg, buf, len, from);
-		break;
-	default:
-		log_group(mg, "receive_recovery_status %d last_callback %d",
-			  from, mg->last_callback);
-	}
-}
-
-/* tell others that all journals are recovered; they should clear
-   memb's from members_gone, clear needs_recovery and unblock locks */
-
-void send_recovery_done(struct mountgroup *mg)
-{
-	struct gdlm_header *hd;
-	int len;
-	char *buf;
-
-	len = sizeof(struct gdlm_header);
-
-	buf = malloc(len);
-	if (!buf)
-		return;
-	memset(buf, 0, len);
-
-	hd = (struct gdlm_header *)buf;
-	hd->type = MSG_RECOVERY_DONE;
-	hd->nodeid = our_nodeid;
-	hd->to_nodeid = 0;
-
-	send_group_message(mg, len, buf);
-
-	free(buf);
-}
-
-void receive_recovery_done(struct mountgroup *mg, char *buf, int len, int from)
-{
-	struct mg_member *memb, *safe;
-
-	log_group(mg, "receive_recovery_done from %d needs_recovery %d",
-		  from, mg->needs_recovery);
-
-	list_for_each_entry_safe(memb, safe, &mg->members_gone, list) {
-		log_group(mg, "receive_recovery_done clear jid %d nodeid %d",
-			  memb->jid, memb->nodeid);
-		list_del(&memb->list);
-		free(memb);
-	}
-
-	mg->needs_recovery = 0;
-	set_sysfs(mg, "block", 0);
-}
-
-void send_remount(struct mountgroup *mg, int ro)
-{
-	struct gdlm_header *hd;
-	int len;
-	char *buf;
-
-	len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN;
-
-	buf = malloc(len);
-	if (!buf)
-		return;
-	memset(buf, 0, len);
-
-	hd = (struct gdlm_header *)buf;
-	hd->type = MSG_REMOUNT;
-	hd->nodeid = our_nodeid;
-	hd->to_nodeid = 0;
-
-	strcpy(buf+sizeof(struct gdlm_header), ro ? "ro" : "rw");
-
-	log_group(mg, "send_remount len %d \"%s\"", len,
-		  buf+sizeof(struct gdlm_header));
-
-	send_group_message(mg, len, buf);
-
-	free(buf);
-}
-
-void receive_remount(struct mountgroup *mg, char *buf, int len, int from)
-{
-	struct mg_member *memb;
-	char *options, *msg = "ok";
-	int rw = 0, ro = 0, error = 0;
-
-	options = (char *) (buf + sizeof(struct gdlm_header));
-
-	memb = find_memb_nodeid(mg, from);
-	if (!memb) {
-		log_error("receive_remount: unknown nodeid %d", from);
-		return;
-	}
-
-	if (strstr(options, "rw"))
-		rw = 1;
-	else if (strstr(options, "ro"))
-		ro = 1;
-	else {
-		msg = "error: invalid option";
-		error = -1;
-		goto out;
-	}
-
-	/* FIXME: check if we've even fully completed our normal mount yet
-	   (received our own mount-status?)  if not, then disallow remount */
-
-	/* FIXME: going ro->rw may mean we can now do journal or first-mounter
-	   recovery that we couldn't do before. */
-
-	memb->readonly = ro;
-	memb->rw = !ro;
-
-	if (ro) {
-		memb->opts &= ~MEMB_OPT_RW;
-		memb->opts |= MEMB_OPT_RO;
-	} else {
-		memb->opts &= ~MEMB_OPT_RO;
-		memb->opts |= MEMB_OPT_RW;
-	}
- out:
-	if (from == our_nodeid) {
-		if (!error) {
-			mg->rw = memb->rw;
-			mg->readonly = memb->readonly;
-		}
-		notify_remount_client(mg, msg);
-	}
-
-	log_group(mg, "receive_remount from %d error %d rw=%d ro=%d opts=%x",
-		  from, error, memb->rw, memb->readonly, memb->opts);
-}
-
-void set_our_memb_options(struct mountgroup *mg)
-{
-	struct mg_member *memb;
-	memb = find_memb_nodeid(mg, our_nodeid);
-	ASSERT(memb);
-
-	if (mg->readonly) {
-		memb->readonly = 1;
-		memb->opts |= MEMB_OPT_RO;
-	} else if (mg->spectator) {
-		memb->spectator = 1;
-		memb->opts |= MEMB_OPT_SPECT;
-	} else if (mg->rw) {
-		memb->rw = 1;
-		memb->opts |= MEMB_OPT_RW;
-	}
-}
-
-void send_options(struct mountgroup *mg)
-{
-	struct gdlm_header *hd;
-	int len;
-	char *buf;
-
-	len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN;
-
-	buf = malloc(len);
-	if (!buf)
-		return;
-	memset(buf, 0, len);
-
-	hd = (struct gdlm_header *)buf;
-	hd->type = MSG_OPTIONS;
-	hd->nodeid = our_nodeid;
-	hd->to_nodeid = 0;
-
-	strncpy(buf+sizeof(struct gdlm_header), mg->options, MAX_OPTIONS_LEN-1);
-
-	log_group(mg, "send_options len %d \"%s\"", len,
-		  buf+sizeof(struct gdlm_header));
-
-	send_group_message(mg, len, buf);
-
-	free(buf);
-}
-
-/* We set the new member's jid to the lowest unused jid.  If we're the lowest
-   existing member (by nodeid), then send jid info to the new node. */
-
-/* Look at rw/ro/spectator status of all existing mounters and whether
-   we need to do recovery.  Based on that, decide if the current mount
-   mode (ro/spectator) is permitted; if not, set jid = -2.  If spectator
-   mount and it's ok, set jid = -1.  If ro or rw mount and it's ok, set
-   real jid. */
-
-int assign_journal(struct mountgroup *mg, struct mg_member *new)
-{
-	struct mg_member *memb, *memb_recover = NULL, *memb_mounted = NULL;
-	int i, total, rw_count, ro_count, spect_count, invalid_count;
-
-	total = rw_count = ro_count = spect_count = invalid_count = 0;
-
-	list_for_each_entry(memb, &mg->members, list) {
-		if (memb->nodeid == new->nodeid)
-			continue;
-		total++;
-		if (memb->jid == -2)
-			invalid_count++;
-		else if (memb->spectator)
-			spect_count++;
-		else if (memb->rw)
-			rw_count++;
-		else if (memb->readonly)
-			ro_count++;
-
-		if (memb->opts & MEMB_OPT_RECOVER) {
-			memb_recover = memb;
-			log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
-				  memb->nodeid);
-		}
-
-		if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
-			memb_mounted = memb;
-	}
-
-	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
-		  "needs_recovery %d", total, invalid_count, rw_count,
-		  ro_count, spect_count, mg->needs_recovery);
-
-	if (new->spectator) {
-		log_group(mg, "assign_journal: new spectator allowed");
-		new->jid = -1;
-		goto out;
-	}
-
-	for (i = 0; i < 1024; i++) {
-		memb = find_memb_jid(mg, i);
-		if (!memb) {
-			new->jid = i;
-			break;
-		}
-	}
-
-	/* Repeat first-mounter recovery: the fs has been mounted and in-use,
-	   but nodes have failed and none of the current mounters has been able
-	   to do recovery (all remaining nodes may be ro/spect for example).
-	   This puts us into the special "needs_recovery" state where new
-	   mounters are asked to do first-mounter recovery of the fs while
-	   the current mounters sit in a blocked state. */
-	   
-	if (mg->needs_recovery) {
-		if (!memb_recover) {
-			log_group(mg, "assign_journal: needs_recovery: "
-				  "new memb %d gets OPT_RECOVER",
-				  new->nodeid);
-			new->opts |= MEMB_OPT_RECOVER;
-		} else {
-			log_group(mg, "assign_journal: needs_recovery: "
-				  "new memb %d memb %d has OPT_RECOVER",
-				  new->nodeid, memb_recover->nodeid);
-		}
-		goto out;
-	}
-
-	/* Initial first-mounter recovery: the fs is coming online, the first
-	   mg member assumes first-mounter role and other nodes join the mg
-	   while the first-mounter is working.  These non-first mounters wait
-	   for the first-mounter to finish before notifying mount.gfs.  If the
-	   first-mounter fails, one of them will become the first-mounter. */
-
-	/* it shouldn't be possible to have someone doing first mounter
-	   recovery and also have someone with the fs fully mounted */
-
-	if (memb_mounted && memb_recover) {
-		log_group(mg, "memb_mounted %d memb_recover %d",
-			  memb_mounted->nodeid, memb_recover->nodeid);
-		ASSERT(0);
-	}
-
-	/* someone has successfully mounted the fs which means the fs doesn't
-	   need first mounter recovery */
-
-	if (memb_mounted) {
-		log_group(mg, "assign_journal: no first recovery needed %d",
-			  memb_mounted->nodeid);
-		goto out;
-	}
-
-	/* someone is currently doing first mounter recovery, they'll send
-	   mount_status when they're done letting everyone know the result */
-
-	if (memb_recover) {
-		log_group(mg, "assign_journal: %d doing first recovery",
-			  memb_recover->nodeid);
-		goto out;
-	}
-
-	/* when we received our journals, no one was flagged with OPT_RECOVER
-	   which means no first mounter recovery is needed or is current */
-
-	if (mg->global_first_recover_done) {
-		log_group(mg, "assign_journal: global_first_recover_done");
-		goto out;
-	}
-
-	/* no one has done kernel mount successfully and no one is doing first
-	   mounter recovery, the new node gets to try first mounter recovery */
-
-	log_group(mg, "kernel_mount_done %d kernel_mount_error %d "
-		      "first_mounter %d first_mounter_done %d",
-		      mg->kernel_mount_done, mg->kernel_mount_error,
-		      mg->first_mounter, mg->first_mounter_done);
-
-	log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
-		  "fs not mounted", new->nodeid);
-	new->opts |= MEMB_OPT_RECOVER;
-
- out:
-	log_group(mg, "assign_journal: new member %d got jid %d opts %x",
-		  new->nodeid, new->jid, new->opts);
-
-	if (mg->master_nodeid == our_nodeid) {
-		store_plocks(mg, new->nodeid);
-		send_journals(mg, new->nodeid);
-	}
-	return 0;
-}
-
-/* Apply the mount options carried in an options message to the member that
-   sent it and then hand that member to assign_journal().  The option string
-   starts right after the gdlm_header in buf and is matched with strstr();
-   only the first match counts, tried in the order spectator, rw, ro.  If
-   find_memb_nodeid() has no member for the sender the message is logged
-   and dropped. */
-
-void _receive_options(struct mountgroup *mg, char *buf, int len, int from)
-{
-	char *opt_str = (char *) (buf + sizeof(struct gdlm_header));
-	struct mg_member *sender = find_memb_nodeid(mg, from);
-
-	if (!sender) {
-		log_error("unknown nodeid %d for options message", from);
-		return;
-	}
-
-	if (strstr(opt_str, "spectator")) {
-		sender->opts |= MEMB_OPT_SPECT;
-		sender->spectator = 1;
-	} else if (strstr(opt_str, "rw")) {
-		sender->opts |= MEMB_OPT_RW;
-		sender->rw = 1;
-	} else if (strstr(opt_str, "ro")) {
-		sender->opts |= MEMB_OPT_RO;
-		sender->readonly = 1;
-	}
-
-	log_group(mg, "_receive_options from %d rw=%d ro=%d spect=%d opts=%x",
-		  from, sender->rw, sender->readonly, sender->spectator,
-		  sender->opts);
-
-	assign_journal(mg, sender);
-}
-
-/* Entry point for an options message delivered to the mountgroup.  Our own
-   message only records got_our_options and switches save_plocks on; any
-   message that arrives before our own is ignored.  Everything else is
-   either applied immediately through _receive_options() or stored with
-   save_message() as MSG_OPTIONS. */
-
-void receive_options(struct mountgroup *mg, char *buf, int len, int from)
-{
-	struct gdlm_header *hdr = (struct gdlm_header *)buf;
-
-	log_group(mg, "receive_options from %d len %d last_cb %d",
-		  from, len, mg->last_callback);
-
-	if (hdr->nodeid == our_nodeid) {
-		mg->got_our_options = 1;
-		mg->save_plocks = 1;
-		return;
-	}
-
-	if (!mg->got_our_options) {
-		log_group(mg, "ignore options from %d", from);
-		return;
-	}
-
-	/* An options message can show up before the start that adds the
-	   mounting node that sent it, and it can also show up before the
-	   journals message for our own mount.  In both cases the message
-	   is saved instead of being applied. */
-
-	if (find_memb_nodeid(mg, from) && mg->got_our_journals) {
-		_receive_options(mg, buf, len, from);
-		return;
-	}
-
-	save_message(mg, buf, len, from, MSG_OPTIONS);
-}
-
-/* Replay every MSG_OPTIONS entry found on mg->saved_messages through
-   _receive_options(), unlinking and freeing each entry once it has been
-   handled.  Entries of any other type are left on the list. */
-
-void process_saved_options(struct mountgroup *mg)
-{
-	struct save_msg *cur, *next;
-
-	if (list_empty(&mg->saved_messages))
-		return;
-
-	log_group(mg, "process_saved_options");
-
-	list_for_each_entry_safe(cur, next, &mg->saved_messages, list) {
-		if (cur->type == MSG_OPTIONS) {
-			_receive_options(mg, cur->buf, cur->len, cur->nodeid);
-			list_del(&cur->list);
-			free(cur);
-		}
-	}
-}
-
-#define NUM 3
-
-/* Build a MSG_JOURNAL message whose to_nodeid is nodeid and whose payload
-   is one little-endian (nodeid, jid, opts) triple per entry on
-   mg->members, then pass it to send_group_message().  The buffer is sized
-   from mg->memb_count.  Nothing is sent if the allocation fails. */
-
-void send_journals(struct mountgroup *mg, int nodeid)
-{
-	struct mg_member *m;
-	struct gdlm_header *hdr;
-	char *msg;
-	int *slot;
-	int msg_len, filled = 0;
-
-	msg_len = sizeof(struct gdlm_header) + (mg->memb_count * NUM * sizeof(int));
-
-	msg = malloc(msg_len);
-	if (!msg)
-		return;
-	memset(msg, 0, msg_len);
-
-	hdr = (struct gdlm_header *)msg;
-	hdr->type = MSG_JOURNAL;
-	hdr->nodeid = our_nodeid;
-	hdr->to_nodeid = nodeid;
-
-	/* the triples start directly behind the header */
-	slot = (int *) (msg + sizeof(struct gdlm_header));
-
-	list_for_each_entry(m, &mg->members, list) {
-		slot[filled] = cpu_to_le32(m->nodeid);
-		slot[filled + 1] = cpu_to_le32(m->jid);
-		slot[filled + 2] = cpu_to_le32(m->opts);
-		filled += NUM;
-	}
-
-	log_group(mg, "send_journals to %d len %d count %d",
-		  nodeid, msg_len, filled);
-
-	send_group_message(mg, msg_len, msg);
-
-	free(msg);
-}
-
-/* Act on the jid we were given for our own mount.  Depending on the jid and
-   on who (if anyone) is doing first-mounter recovery, this either notifies
-   the mount client right away or sets mount_client_delay and returns
-   without notifying. */
-
-void received_our_jid(struct mountgroup *mg)
-{
-	log_group(mg, "received_our_jid %d", mg->our_jid);
-
-	/* A jid of -2 means we are not permitted to mount the fs; probably
-	   because we are mounting readonly while the next mounter is
-	   required to be rw.  The client is notified with the error. */
-
-	if (mg->our_jid == -2) {
-		strcpy(mg->error_msg, "error: jid is -2, try rw");
-		notify_mount_client(mg);
-		return;
-	}
-
-	/* The fs needs recovery and the existing mounters can't do it
-	   (they're spectator/readonly or the first mounter's mount(2)
-	   failed), so we've been told to do first-mounter recovery. */
-
-	if (local_first_mounter_recovery(mg)) {
-		log_group(mg, "we're told to do first mounter recovery");
-		mg->first_mounter = 1;
-		mg->first_mounter_done = 0;
-		mg->mount_client_delay = 0;
-		mg->save_plocks = 0;
-		notify_mount_client(mg);
-		return;
-	}
-
-	/* Another node is the first mounter: hold back the mount client
-	   until that node reports a successful mount status. */
-
-	if (remote_first_mounter_recovery(mg)) {
-		log_group(mg, "other node doing first mounter recovery, "
-			  "set mount_client_delay");
-		mg->mount_client_delay = 1;
-		mg->save_plocks = 0;
-		return;
-	}
-
-	/* ordinary mount: load plock state, stop saving, replay what was
-	   saved, then let the client go ahead */
-
-	retrieve_plocks(mg);
-	mg->save_plocks = 0;
-	process_saved_plocks(mg);
-	notify_mount_client(mg);
-}
-
-/* Process the journals message for our own mount.  The payload behind the
-   gdlm_header is a sequence of little-endian (nodeid, jid, opts) triples;
-   each one is copied onto the matching member.  A triple that names an
-   unknown nodeid, or a jid that some member already holds, is logged as
-   invalid; unknown nodeids are then skipped.  Afterwards the saved mount
-   status and options messages are replayed and received_our_jid() runs. */
-
-void _receive_journals(struct mountgroup *mg, char *buf, int len, int from)
-{
-	struct mg_member *by_nodeid, *by_jid;
-	int *triple;
-	int entries, n, nodeid, jid, opts;
-	int recover_in_progress = 0;
-
-	entries = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
-	triple = (int *) (buf + sizeof(struct gdlm_header));
-
-	for (n = 0; n < entries; n++, triple += NUM) {
-		nodeid = le32_to_cpu(triple[0]);
-		jid    = le32_to_cpu(triple[1]);
-		opts   = le32_to_cpu(triple[2]);
-
-		log_debug("receive nodeid %d jid %d opts %x",
-			  nodeid, jid, opts);
-
-		by_nodeid = find_memb_nodeid(mg, nodeid);
-		by_jid = find_memb_jid(mg, jid);
-
-		if (by_jid || !by_nodeid) {
-			log_error("invalid journals message "
-				  "nodeid %d jid %d opts %x",
-				  nodeid, jid, opts);
-		}
-		if (!by_nodeid)
-			continue;
-
-		by_nodeid->jid = jid;
-
-		if (nodeid != our_nodeid) {
-			by_nodeid->opts = opts;
-			if (opts & MEMB_OPT_RO)
-				by_nodeid->readonly = 1;
-			else if (opts & MEMB_OPT_RW)
-				by_nodeid->rw = 1;
-			else if (opts & MEMB_OPT_SPECT)
-				by_nodeid->spectator = 1;
-		} else {
-			mg->our_jid = jid;
-			/* set_our_memb_options() sets rest */
-			if (opts & MEMB_OPT_RECOVER)
-				by_nodeid->opts |= MEMB_OPT_RECOVER;
-		}
-
-		if (opts & MEMB_OPT_RECOVER)
-			recover_in_progress = 1;
-	}
-
-	/* FIXME: use global_first_recover_done more widely instead of
-	   as a single special case */
-	if (!recover_in_progress)
-		mg->global_first_recover_done = 1;
-
-	process_saved_mount_status(mg);
-
-	/* options messages from new mounters were held back until the
-	   journals message for our own mount had been received */
-
-	process_saved_options(mg);
-
-	received_our_jid(mg);
-}
-
-/* Entry point for a journals message delivered to the mountgroup.  The
-   addressed member (hd->to_nodeid) has needs_journals cleared.  The
-   message is processed further only when it is addressed to us (or has a
-   zero to_nodeid) and we have not handled a journals message before. */
-
-void receive_journals(struct mountgroup *mg, char *buf, int len, int from)
-{
-	struct gdlm_header *hdr = (struct gdlm_header *)buf;
-	struct mg_member *target;
-	int entries = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
-
-	log_group(mg, "receive_journals from %d to %d len %d count %d cb %d",
-		  from, hdr->to_nodeid, len, entries, mg->last_callback);
-
-	/* As with options messages, the journals message sent to a newly
-	   added node can reach us before the start that adds that node. */
-
-	target = find_memb_nodeid(mg, hdr->to_nodeid);
-	if (!target) {
-		log_group(mg, "receive_journals from %d to unknown %d",
-			  from, hdr->to_nodeid);
-		return;
-	}
-	target->needs_journals = 0;
-
-	/* addressed to some other node */
-	if (hdr->to_nodeid && hdr->to_nodeid != our_nodeid)
-		return;
-
-	if (mg->got_our_journals) {
-		log_group(mg, "receive_journals from %d duplicate", from);
-		return;
-	}
-
-	mg->got_our_journals = 1;
-	_receive_journals(mg, buf, len, from);
-}
-
-/* Link new into mg->members in front of the first member whose nodeid is
-   larger than new->nodeid, or at the tail when there is no such member. */
-
-static void add_ordered_member(struct mountgroup *mg, struct mg_member *new)
-{
-	struct list_head *head = &mg->members;
-	struct list_head *pos;
-
-	/* stop on the first larger nodeid; when none exists (including the
-	   empty list) the cursor ends up back on head */
-	list_for_each(pos, head) {
-		if (new->nodeid < list_entry(pos, struct mg_member, list)->nodeid)
-			break;
-	}
-
-	/* list_add_tail() links the new entry in directly before pos, which
-	   is the tail of the whole list when pos is head */
-	list_add_tail(&new->list, pos);
-}
-
-/* Allocate a zeroed member for nodeid with jid JID_INIT, insert it into
-   mg->members with add_ordered_member() and bump mg->memb_count.  The
-   member is flagged needs_journals unless mg->init is set.  Returns 0, or
-   -ENOMEM when the allocation fails. */
-
-int add_member(struct mountgroup *mg, int nodeid)
-{
-	struct mg_member *fresh = malloc(sizeof(struct mg_member));
-
-	if (!fresh)
-		return -ENOMEM;
-
-	memset(fresh, 0, sizeof(*fresh));
-	fresh->jid = JID_INIT;
-	fresh->nodeid = nodeid;
-
-	add_ordered_member(mg, fresh);
-	mg->memb_count++;
-
-	if (!mg->init)
-		fresh->needs_journals = 1;
-
-	return 0;
-}
-
-/* Returns TRUE when an entry on mg->members has the given nodeid,
-   FALSE otherwise. */
-
-int is_member(struct mountgroup *mg, int nodeid)
-{
-	struct mg_member *cur;
-	int present = FALSE;
-
-	list_for_each_entry(cur, &mg->members, list) {
-		if (cur->nodeid == nodeid) {
-			present = TRUE;
-			break;
-		}
-	}
-	return present;
-}
-
-/* Returns TRUE when an entry on mg->members_gone has the given nodeid,
-   FALSE otherwise. */
-
-int is_removed(struct mountgroup *mg, int nodeid)
-{
-	struct mg_member *cur;
-	int gone = FALSE;
-
-	list_for_each_entry(cur, &mg->members_gone, list) {
-		if (cur->nodeid == nodeid) {
-			gone = TRUE;
-			break;
-		}
-	}
-	return gone;
-}
-
-/* Unlink and free every mg_member on the list anchored at head. */
-
-static void clear_memb_list(struct list_head *head)
-{
-	struct mg_member *cur, *next;
-
-	list_for_each_entry_safe(cur, next, head, list) {
-		list_del(&cur->list);
-		free(cur);
-	}
-}
-
-/* Free all current members of the mountgroup and reset its member count. */
-
-void clear_members(struct mountgroup *mg)
-{
-	mg->memb_count = 0;
-	clear_memb_list(&mg->members);
-}
-
-void clear_members_gone(struct mountgroup *mg)
-{
-	clear_memb_list(&mg->members_gone);	/* frees every departed member; memb_count is not touched */
-}
-
-/* A failed master may have left new mounters waiting for a journals message
-   it never sent.  When we take over as master, every member that still has
-   needs_journals set gets a journals message from us.  The plock state is
-   checkpointed once, before the first such message, so the new nodes can
-   read it after they get their journals message. */
-
-void resend_journals(struct mountgroup *mg)
-{
-	struct mg_member *cur;
-	int ckpt_written = 0;
-
-	list_for_each_entry(cur, &mg->members, list) {
-		if (cur->needs_journals) {
-			if (!ckpt_written) {
-				store_plocks(mg, cur->nodeid);
-				ckpt_written = 1;
-			}
-
-			log_group(mg, "resend_journals to %d", cur->nodeid);
-			send_journals(mg, cur->nodeid);
-		}
-	}
-}
-
-/* Recompute mg->master_nodeid and mg->low_nodeid from mg->members.
-
-   The master is the member with the lowest nodeid that was also part of the
-   last "finished" group, i.e. was a member the last time the group got a
-   finish callback.  The master sends state to nodes joining the group, so
-   it must hold that state itself; a freshly joined node with the lowest
-   nodeid has none, which is why "finished" is required.  low_nodeid is the
-   lowest nodeid regardless of finished.  Either value is -1 when no member
-   qualifies. */
-
-void update_master_nodeid(struct mountgroup *mg)
-{
-	struct mg_member *cur;
-	int lowest_any = -1, lowest_finished = -1;
-
-	list_for_each_entry(cur, &mg->members, list) {
-		if (lowest_any == -1 || cur->nodeid < lowest_any)
-			lowest_any = cur->nodeid;
-
-		if (cur->finished &&
-		    (lowest_finished == -1 || cur->nodeid < lowest_finished))
-			lowest_finished = cur->nodeid;
-	}
-
-	mg->low_nodeid = lowest_any;
-	mg->master_nodeid = lowest_finished;
-}
-
-/* This can happen before we receive a journals message for our mount.
-
-   Bring mg->members in line with the membership given in
-   nodeids[0..num_nodes-1]:
-   - members missing from nodeids are moved to mg->members_gone and marked
-     for journal recovery where that applies
-   - nodeids that are not members yet are added with add_member()
-   - master_nodeid/low_nodeid are recomputed with update_master_nodeid()
-   *pos_out receives the number of nodes added, *neg_out the number removed.
-   When nodes were removed the function also deals with a failed master and
-   with a failed first mounter (see the comments in the body).
-
-   NOTE(review): the return value of add_member() is ignored, and the
-   results of find_memb_nodeid(mg, our_nodeid) are dereferenced without a
-   NULL check -- confirm our own nodeid is always a member here. */
-
-void recover_members(struct mountgroup *mg, int num_nodes,
- 		     int *nodeids, int *pos_out, int *neg_out)
-{
-	struct mg_member *memb, *safe, *memb_gone_recover = NULL;
-	int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
-	int master_failed = 0;
-
-	/* move departed nodes from members list to members_gone */
-
-	list_for_each_entry_safe(memb, safe, &mg->members, list) {
-		found = FALSE;
-		for (i = 0; i < num_nodes; i++) {
-			if (memb->nodeid == nodeids[i]) {
-				found = TRUE;
-				break;
-			}
-		}
-
-		if (!found) {
-			neg++;
-
-			list_move(&memb->list, &mg->members_gone);
-			memb->gone_event = mg->start_event_nr;
-			memb->gone_type = mg->start_type;
-			mg->memb_count--;
-
-			memb->tell_gfs_to_recover = 0;
-			memb->recovery_status = 0;
-			memb->local_recovery_status = 0;
-
-			/* - journal cb for failed or withdrawing nodes
-			   - failed node was assigned a journal
-			   - no journal cb if failed node was spectator
-			   - no journal cb if we've already done a journal cb */
-
-			if ((memb->gone_type == GROUP_NODE_FAILED ||
-			    memb->withdrawing) &&
-			    memb->jid != JID_INIT &&
-			    memb->jid != -2 &&
-			    !memb->spectator &&
-			    !memb->wait_gfs_recover_done) {
-				memb->tell_gfs_to_recover = 1;
-				memb->recovery_status = RS_NEED_RECOVERY;
-				memb->local_recovery_status = RS_NEED_RECOVERY;
-			}
-
-			log_group(mg, "remove member %d tell_gfs_to_recover %d "
-				  "(%d,%d,%d,%d,%d,%d)",
-				  memb->nodeid, memb->tell_gfs_to_recover,
-				  mg->spectator,
-				  mg->start_type,
-				  memb->withdrawing,
-				  memb->jid,
-				  memb->spectator,
-				  memb->wait_gfs_recover_done);
-
-			if (mg->master_nodeid == memb->nodeid &&
-			    memb->gone_type == GROUP_NODE_FAILED)
-				master_failed = 1;
-
-			if (memb->opts & MEMB_OPT_RECOVER)
-				memb_gone_recover = memb;
-		}
-	}	
-
-	/* add new nodes to members list */
-
-	for (i = 0; i < num_nodes; i++) {
-		id = nodeids[i];
-		if (is_member(mg, id))
-			continue;
-		add_member(mg, id);
-		pos++;
-		log_group(mg, "add member %d", id);
-	}
-
-	prev_master_nodeid = mg->master_nodeid;
-	update_master_nodeid(mg);
-
-	*pos_out = pos;
-	*neg_out = neg;
-
-	log_group(mg, "total members %d master_nodeid %d prev %d",
-		  mg->memb_count, mg->master_nodeid, prev_master_nodeid);
-
-
-	/* The master failed and we're the new master, we need to:
-
-	   - unlink the ckpt that the failed master had open so new ckpts
-	     can be created down the road
-	   - resend journals msg to any nodes that needed one from the
-	     failed master
-	   - store plocks in ckpt for the new mounters to read when they
-	     get the journals msg from us */
-
-	if (neg && master_failed &&
-	    (prev_master_nodeid != -1) &&
-	    (prev_master_nodeid != mg->master_nodeid) &&
-	    (our_nodeid == mg->master_nodeid)) {
-		log_group(mg, "unlink ckpt for failed master %d",
-			  prev_master_nodeid);
-		unlink_checkpoint(mg);
-		resend_journals(mg);
-	}
-
-	/* Do we need a new first mounter?
-
-	   If we've not gotten a journals message yet (implies we're mounting)
-	   and there's only one node left in the group (us, after removing the
-	   failed node), then it's possible that the failed node was doing
-	   first mounter recovery, so we need to become first mounter.
-
-	   If we've received a journals message, we can check if the failed
-	   node was doing first mounter recovery (MEMB_OPT_RECOVER set) and
-	   if so select the next first mounter. */
-
-	if (!neg)
-		return;
-
-	if (!mg->got_our_journals && mg->memb_count == 1) {
-		log_group(mg, "we are left alone, act as first mounter");
-		unlink_checkpoint(mg);
-		memb = find_memb_nodeid(mg, our_nodeid);
-		memb->jid = 0;
-		memb->opts |= MEMB_OPT_RECOVER;
-		mg->our_jid = 0;
-		mg->first_mounter = 1;
-		mg->first_mounter_done = 0;
-		mg->got_our_options = 1;
-		mg->got_our_journals = 1;
-		mg->mount_client_delay = 0;
-		notify_mount_client(mg);
-		return;
-	}
-
-	if (memb_gone_recover) {
-		log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
-			  memb_gone_recover->nodeid);
-		memb_gone_recover->tell_gfs_to_recover = 0;
-	}
-
-	if (memb_gone_recover && mg->got_our_journals) {
-		assign_next_first_mounter(mg);
-		memb = find_memb_nodeid(mg, our_nodeid);
-		if (memb->opts & MEMB_OPT_RECOVER) {
-			log_group(mg, "first mounter failed, we get "
-				  "MEMB_OPT_RECOVER");
-			unlink_checkpoint(mg);
-			memb->opts |= MEMB_OPT_RECOVER;
-			mg->first_mounter = 1;
-			mg->first_mounter_done = 0;
-			mg->mount_client_delay = 0;
-			notify_mount_client(mg);
-		}
-	}
-}
-
-/* Allocate and initialise a mountgroup called name with a single
-   mountpoint for dir.  All list heads are initialised, init is set and
-   master_nodeid/low_nodeid start out as -1.  The new mountgroup is not
-   put on any global list here.  Returns NULL if either allocation fails. */
-
-struct mountgroup *create_mg(char *name, char *dir)
-{
-	struct mountgroup *mg;
-	struct mountpoint *mp;
-
-	mg = malloc(sizeof(struct mountgroup));
-	if (!mg)
-		return NULL;
-	memset(mg, 0, sizeof(struct mountgroup));
-
-	INIT_LIST_HEAD(&mg->members);
-	INIT_LIST_HEAD(&mg->members_gone);
-	INIT_LIST_HEAD(&mg->resources);
-	INIT_LIST_HEAD(&mg->saved_messages);
-	INIT_LIST_HEAD(&mg->mountpoints);
-	mg->init = 1;
-	mg->master_nodeid = -1;
-	mg->low_nodeid = -1;
-
-	/* NOTE(review): this stays terminated only if mg->name holds at
-	   least MAXNAME+1 bytes -- confirm against the struct definition */
-	strncpy(mg->name, name, MAXNAME);
-
-	mp = malloc(sizeof(struct mountpoint));
-	if (!mp) {
-		free(mg);
-		return NULL;
-	}
-	memset(mp, 0, sizeof(struct mountpoint));
-
-	/* strncpy() does not terminate when the source fills the buffer;
-	   copying one byte less keeps the final zero from the memset */
-	strncpy(mp->dir, dir, sizeof(mp->dir) - 1);
-	list_add(&mp->list, &mg->mountpoints);
-
-	return mg;
-}
-
-/* Look up a mountgroup on the global mounts list by exact name.
-   Returns NULL when there is no match. */
-
-struct mountgroup *find_mg(char *name)
-{
-	struct mountgroup *cur;
-
-	/* same length and same bytes over that length is plain equality */
-	list_for_each_entry(cur, &mounts, list) {
-		if (strcmp(cur->name, name) == 0)
-			return cur;
-	}
-	return NULL;
-}
-
-/* Look up a mountgroup on the global mounts list by its id.
-   Returns NULL when there is no match. */
-
-struct mountgroup *find_mg_id(uint32_t id)
-{
-	struct mountgroup *cur, *match = NULL;
-
-	list_for_each_entry(cur, &mounts, list) {
-		if (cur->id == id) {
-			match = cur;
-			break;
-		}
-	}
-	return match;
-}
-
-/* Return the mountpoint of mg whose dir equals dir exactly, or NULL. */
-
-struct mountpoint *find_mountpoint(struct mountgroup *mg, char *dir)
-{
-	struct mountpoint *cur, *match = NULL;
-
-	list_for_each_entry(cur, &mg->mountpoints, list) {
-		if (strcmp(cur->dir, dir) == 0) {
-			match = cur;
-			break;
-		}
-	}
-	return match;
-}
-
-/* Return the first mountgroup on the global mounts list that has a
-   mountpoint for dir, or NULL. */
-
-struct mountgroup *find_mg_dir(char *dir)
-{
-	struct mountgroup *cur, *match = NULL;
-
-	list_for_each_entry(cur, &mounts, list) {
-		if (find_mountpoint(cur, dir)) {
-			match = cur;
-			break;
-		}
-	}
-	return match;
-}
-
-/* Ask the group daemon for the level 0 group named "default".  Returns 1
-   only when the query succeeds, the group's client_name is "fence" and the
-   member field is 1; returns 0 in every other case. */
-
-static int we_are_in_fence_domain(void)
-{
-	group_data_t info;
-
-	memset(&info, 0, sizeof(info));
-
-	if (group_get_group(0, "default", &info))
-		return 0;
-
-	if (strcmp(info.client_name, "fence") != 0)
-		return 0;
-
-	return (info.member == 1) ? 1 : 0;
-}
-
-/* Register one more mount directory for a mountgroup that already exists.
-   ci becomes the mg's mount_client.
-
-   Returns:
-   -EADDRINUSE  dev differs from the device recorded for the mountgroup
-   -EBUSY       dir is already a mountpoint, or another mount of this
-                mountgroup is still in progress
-   -ENOMEM      allocation failure
-   -EALREADY    the mountpoint was added (see the comment at the return) */
-
-int add_another_mountpoint(struct mountgroup *mg, char *dir, char *dev, int ci)
-{
-	struct mountpoint *mp;
-
-	log_group(mg, "add_another_mountpoint dir %s dev %s ci %d",
-		  dir, dev, ci);
-
-	/* check if this is the same fs mounted on another dir or a different
-	   fs with the same name (which is an error) */
-
-	if (strcmp(mg->dev, dev)) {
-		log_error("different fs dev %s with same name", mg->dev);
-		return -EADDRINUSE;
-	}
-
-	if (find_mountpoint(mg, dir)) {
-		log_error("mount point %s already used", dir);
-		return -EBUSY;
-	}
-
-	/* we only really need to check one of these */
-	if (mg->mount_client || mg->mount_client_fd || !mg->kernel_mount_done) {
-		log_error("other mount in progress client %d fd %d done %d",
-			  mg->mount_client, mg->mount_client_fd,
-			  mg->kernel_mount_done);
-		return -EBUSY;
-	}
-
-	mp = malloc(sizeof(struct mountpoint));
-	if (!mp)
-		return -ENOMEM;
-
-	memset(mp, 0, sizeof(struct mountpoint));
-
-	/* strncpy() does not terminate when the source fills the buffer;
-	   copying one byte less keeps the final zero from the memset */
-	strncpy(mp->dir, dir, sizeof(mp->dir) - 1);
-	list_add(&mp->list, &mg->mountpoints);
-	mg->mount_client = ci;
-
-	/* we return this special error to mount.gfs which mount.gfs will
-	   recognize as meaning the fs is already mounted, so it shouldn't
-	   read any hostdata from us, but just go ahead and mount(2) */
-
-	return -EALREADY;
-}
-
-/* Handle a mount request from client ci.
-
-   table has the form <cluster>:<name>.  If a mountgroup called <name>
-   already exists the request is passed to add_another_mountpoint() (or
-   refused with -ESTALE while that mountgroup rejects mounts).  Otherwise a
-   new mountgroup is created, validated, put on the mounts list and joined
-   with group_join().
-
-   *mg_ret is set to the mountgroup the request refers to whenever that
-   mountgroup still exists on return.  If a newly created mountgroup has
-   to be thrown away because of an error, *mg_ret is set to NULL.
-
-   Returns 0 on success, the result of add_another_mountpoint() for an
-   existing mountgroup, or one of -EPROTONOSUPPORT, -EOPNOTSUPP, -EBADFD,
-   -ENAMETOOLONG, -ESTALE, -ENOMEM, -EBADR, -ENOANO, -EROFS, -EMLINK. */
-
-int do_mount(int ci, char *dir, char *type, char *proto, char *table,
-	     char *options, char *dev, struct mountgroup **mg_ret)
-{
-	struct mountgroup *mg = NULL;
-	struct mountpoint *mp, *mp_safe;
-	char table2[MAXLINE];
-	char *cluster = NULL, *name = NULL;
-	int rv, new_mg = 0;
-
-	log_debug("mount: %s %s %s %s %s %s",
-		  dir, type, proto, table, options, dev);
-
-	if (strcmp(proto, "lock_dlm")) {
-		log_error("mount: lockproto %s not supported", proto);
-		rv = -EPROTONOSUPPORT;
-		goto out;
-	}
-
-	if (strstr(options, "jid=") ||
-	    strstr(options, "first=") ||
-	    strstr(options, "id=")) {
-		log_error("mount: jid, first and id are reserved options");
-		rv = -EOPNOTSUPP;
-		goto out;
-	}
-
-	/* table is <cluster>:<name> */
-
-	/* copy one byte less than the buffer so the zero from the memset
-	   always terminates the string */
-	memset(&table2, 0, MAXLINE);
-	strncpy(table2, table, MAXLINE - 1);
-
-	name = strstr(table2, ":");
-	if (!name) {
-		rv = -EBADFD;
-		goto out;
-	}
-
-	*name = '\0';
-	name++;
-	cluster = table2;
-
-	if (strlen(name) > MAXNAME) {
-		rv = -ENAMETOOLONG;
-		goto out;
-	}
-
-	mg = find_mg(name);
-	if (mg) {
-		if (mg->reject_mounts) {
-			/* fs is being unmounted */
-			rv = -ESTALE;
-			log_error("mount: reject mount due to unmount");
-		} else {
-			rv = add_another_mountpoint(mg, dir, dev, ci);
-		}
-		goto out;
-	}
-
-	mg = create_mg(name, dir);
-	if (!mg) {
-		rv = -ENOMEM;
-		goto out;
-	}
-	new_mg = 1;
-
-	/* create_mg() zeroes the struct, so copying one byte less than each
-	   field keeps every string terminated */
-	mg->mount_client = ci;
-	strncpy(mg->type, type, sizeof(mg->type) - 1);
-	strncpy(mg->table, table, sizeof(mg->table) - 1);
-	strncpy(mg->options, options, sizeof(mg->options) - 1);
-	strncpy(mg->dev, dev, sizeof(mg->dev) - 1);
-
-	if (strlen(cluster) != strlen(clustername) ||
-	    strlen(cluster) == 0 || strcmp(cluster, clustername)) {
-		rv = -EBADR;
-		log_error("mount: fs requires cluster=\"%s\" current=\"%s\"",
-			  cluster, clustername);
-		goto out;
-	} else
-		log_group(mg, "cluster name matches: %s", clustername);
-
-	if (strstr(options, "spectator")) {
-		log_group(mg, "spectator mount");
-		mg->spectator = 1;
-	} else {
-		if (!we_are_in_fence_domain()) {
-			rv = -ENOANO;
-			log_error("mount: not in default fence domain");
-			goto out;
-		}
-	}
-
-	if (!mg->spectator && strstr(options, "rw"))
-		mg->rw = 1;
-	else if (strstr(options, "ro")) {
-		if (mg->spectator) {
-			rv = -EROFS;
-			log_error("mount: readonly invalid with spectator");
-			goto out;
-		}
-		mg->readonly = 1;
-	}
-
-	if (strlen(options) > MAX_OPTIONS_LEN-1) {
-		rv = -EMLINK;
-		log_error("mount: options too long %zu", strlen(options));
-		goto out;
-	}
-
-	list_add(&mg->list, &mounts);
-	group_join(gh, name);
-	rv = 0;
- out:
-	if (mg)
-		log_group(mg, "do_mount: rv %d", rv);
-
-	if (rv && new_mg) {
-		/* A new mg that failed validation never reached the mounts
-		   list.  Release the mountpoint create_mg() attached to it
-		   as well, and do not hand the freed mg to the caller. */
-		list_for_each_entry_safe(mp, mp_safe, &mg->mountpoints, list) {
-			list_del(&mp->list);
-			free(mp);
-		}
-		free(mg);
-		*mg_ret = NULL;
-	} else if (mg) {
-		*mg_ret = mg;
-	}
-	return rv;
-}
-
-/* recover_members() discovers which nodes need journal recovery
-   and moves the memb structs for those nodes into members_gone
-   and sets memb->tell_gfs_to_recover on them */
-
-/* we don't want to tell gfs-kernel to do journal recovery for a failed
-   node in a number of cases:
-   - we're a spectator or readonly mount
-   - gfs-kernel is currently withdrawing
-   - we're mounting and haven't received a journals message yet
-   - we're mounting and got a kernel mount error back from mount.gfs
-   - we're mounting and haven't notified mount.gfs yet (to do mount(2))
-   - we're mounting and got_kernel_mount is 0, i.e. we've not seen a uevent
-     related to the kernel mount yet
-   (some of the mounting checks should be obviated by others)
-
-   the problem we're trying to avoid here is telling gfs-kernel to do
-   recovery when it can't for some reason and then waiting forever for
-   a recovery_done signal that will never arrive.
-
-   recover_journals() drives that recovery one journal per call: it either
-   calls start_done() straight away (we can't recover), returns while a
-   kernel recovery is outstanding, hands the next jid to the kernel through
-   the "recover" sysfs file, or -- once nothing is left to try -- sends out
-   recovery status or calls start_done(). */
- 
-void recover_journals(struct mountgroup *mg)
-{
-	struct mg_member *memb;
-	int rv;
-
-	if (mg->spectator ||
-	    mg->readonly ||
-	    mg->withdraw ||
-	    mg->our_jid == JID_INIT ||
-	    mg->kernel_mount_error ||
-	    !mg->mount_client_notified ||
-	    !mg->got_kernel_mount ||
-	    !mg->kernel_mount_done) {
-		log_group(mg, "recover_journals: unable %d,%d,%d,%d,%d,%d,%d,%d",
-			  mg->spectator,
-			  mg->readonly,
-			  mg->withdraw,
-			  mg->our_jid,
-			  mg->kernel_mount_error,
-			  mg->mount_client_notified,
-			  mg->got_kernel_mount,
-			  mg->kernel_mount_done);
-
-		list_for_each_entry(memb, &mg->members_gone, list) {
-			log_group(mg, "member gone %d jid %d "
-				  "tell_gfs_to_recover %d",
-				  memb->nodeid, memb->jid,
-				  memb->tell_gfs_to_recover);
-
-			if (memb->tell_gfs_to_recover) {
-				memb->tell_gfs_to_recover = 0;
-				memb->local_recovery_status = RS_READONLY;
-			}
-		}
-		start_done(mg);
-		return;
-	}
-
-	/* we feed one jid into the kernel for recovery instead of all
-	   at once because we need to get the result of each independently
-	   through the single recovery_done sysfs file */
-
-	list_for_each_entry(memb, &mg->members_gone, list) {
-		if (memb->wait_gfs_recover_done) {
-			log_group(mg, "delay new gfs recovery, "
-			  	  "wait_gfs_recover_done for nodeid %d jid %d",
-			  	  memb->nodeid, memb->jid);
-			return;
-		}
-	}
-
-	list_for_each_entry(memb, &mg->members_gone, list) {
-		if (!memb->tell_gfs_to_recover)
-			continue;
-
-		log_group(mg, "recover journal %d nodeid %d",
-			  memb->jid, memb->nodeid);
-
-		rv = set_sysfs(mg, "recover", memb->jid);
-		if (rv < 0) {
-			memb->local_recovery_status = RS_NOFS;
-			continue;
-		}
-		memb->tell_gfs_to_recover = 0;
-		memb->wait_gfs_recover_done = 1;
-		return;
-	}
-
-	/* no more journals to attempt to recover, if we've been successful
-	   recovering any then send out status, if not then start_done...
-	   receiving no status message from us before start_done means we
-	   didn't successfully recover any journals.  If we send out status,
-	   then delay start_done until we get our own message (so all nodes
-	   will get the status before finish) */
-
-	list_for_each_entry(memb, &mg->members_gone, list) {
-		if (memb->local_recovery_status == RS_SUCCESS) {
-			send_recovery_status(mg);
-			log_group(mg, "delay start_done until status recvd");
-			return;
-		}
-	}
-
-	start_done(mg);
-}
-
-/* In some cases, we may be joining a mountgroup with needs_recovery
-   set (there are journals that need recovery and current members can't
-   recover them because they're ro).  In this case, we're told to act
-   like the first mounter to cause gfs to try to recover all journals
-   when it mounts.  When gfs does this, we'll get recovery_done's for
-   the individual journals it recovers (ignored) and finally, if all
-   journals are ok, an others_may_mount/first_done. */
- 
-/* When gfs does first-mount recovery, the mount(2) fails if it can't
-   recover one of the journals.  If we get o_m_m, then we know it was
-   able to successfully recover all the journals. */
-
-/* When we're the first mounter, gfs does recovery on all the journals
-   and does "recovery_done" callbacks when it finishes each.  We ignore
-   these and wait for gfs to be finished with all at which point it calls
-   others_may_mount() and first_done is set. */
-
-/* recovery_done handling while we are the first mounter: read the
-   "first_done" sysfs value and, once it is non-zero, mark first-mounter
-   recovery finished and call send_recovery_done().  Returns the negative
-   get_sysfs() result on a read error, otherwise 0. */
-
-int kernel_recovery_done_first(struct mountgroup *mg)
-{
-	char val[MAXLINE];
-	int first_done, rv;
-
-	memset(val, 0, sizeof(val));
-
-	rv = get_sysfs(mg, "first_done", val, sizeof(val));
-	if (rv < 0)
-		return rv;
-
-	first_done = atoi(val);
-
-	log_group(mg, "kernel_recovery_done_first first_done %d", first_done);
-
-	if (mg->kernel_mount_done)
-		log_group(mg, "FIXME: assuming kernel_mount_done comes after "
-			  "first_done");
-
-	if (!first_done)
-		return 0;
-
-	mg->first_mounter_done = 1;
-	send_recovery_done(mg);
-	return 0;
-}
-
-/* Returns 1 when any departed member still has wait_gfs_recover_done set,
-   otherwise 0. */
-
-int need_kernel_recovery_done(struct mountgroup *mg)
-{
-	struct mg_member *cur;
-	int pending = 0;
-
-	list_for_each_entry(cur, &mg->members_gone, list) {
-		if (cur->wait_gfs_recover_done) {
-			pending = 1;
-			break;
-		}
-	}
-	return pending;
-}
-
-/* Note: when a readonly node fails we do consider its journal (and the
-   fs) to need recovery... not sure this is really necessary, but
-   the readonly node did "own" a journal so it seems proper to recover
-   it even if the node wasn't writing to it.  So, if there are 3 ro
-   nodes mounting the fs and one fails, gfs on the remaining 2 will
-   remain blocked until an rw node mounts, and the next mounter must
-   be rw. */
-
-/* Handle a recovery_done notification for the fs named by table, which has
-   the form <cluster>:<name>.
-
-   While we are the first mounter and not done yet, the work is delegated
-   to kernel_recovery_done_first().  Otherwise the jid read from the
-   "recover_done" sysfs file is matched against the departed members; a
-   notification for a member that is not waiting for it is ignored.  For a
-   match the result is read from "recover_status", stored in
-   local_recovery_status, and recover_journals() is called to continue.
-
-   Returns -1 when table has no ':' or names an unknown mountgroup, a
-   negative get_sysfs() result when "recover_done" cannot be read, the
-   result of kernel_recovery_done_first() when delegated, otherwise 0. */
-
-int kernel_recovery_done(char *table)
-{
-	struct mountgroup *mg;
-	struct mg_member *memb;
-	char buf[MAXLINE];
-	char *ss, *name;
-	int rv, jid_done, status, found = 0;
-
-	/* without a ':' there is no name part; strstr() returns NULL and
-	   must not be used for pointer arithmetic */
-	name = strstr(table, ":");
-	if (!name) {
-		log_error("recovery_done: invalid table name %s", table);
-		return -1;
-	}
-	name++;
-
-	mg = find_mg(name);
-	if (!mg) {
-		log_error("recovery_done: unknown mount group %s", table);
-		return -1;
-	}
-
-	if (mg->first_mounter && !mg->first_mounter_done)
-		return kernel_recovery_done_first(mg);
-
-	memset(buf, 0, sizeof(buf));
-
-	rv = get_sysfs(mg, "recover_done", buf, sizeof(buf));
-	if (rv < 0)
-		return rv;
-	jid_done = atoi(buf);
-
-	/* memb is only used below when found is set, i.e. when the loop
-	   was left through the break with memb on the matching entry */
-	list_for_each_entry(memb, &mg->members_gone, list) {
-		if (memb->jid == jid_done) {
-			if (memb->wait_gfs_recover_done) {
-				memb->wait_gfs_recover_done = 0;
-				found = 1;
-			}
-			break;
-		}
-	}
-
-	/* We need to ignore recovery_done callbacks in the case where there
-	   are a bunch of recovery_done callbacks for the first mounter, but
-	   we detect "first_done" before we've processed all the
-	   recovery_done's. */
-
-	if (!found) {
-		log_group(mg, "recovery_done jid %d ignored, first %d,%d",
-			  jid_done, mg->first_mounter, mg->first_mounter_done);
-		return 0;
-	}
-
-	memset(buf, 0, sizeof(buf));
-
-	rv = get_sysfs(mg, "recover_status", buf, sizeof(buf));
-	if (rv < 0) {
-		log_group(mg, "recovery_done jid %d nodeid %d sysfs error %d",
-			  memb->jid, memb->nodeid, rv);
-		memb->local_recovery_status = RS_NOFS;
-		goto out;
-	}
-
-	status = atoi(buf);
-
-	switch (status) {
-	case LM_RD_GAVEUP:
-		/*
-		 * This is unfortunate; it's needed for bz 442451 where
-		 * gfs-kernel fails to acquire the journal lock on all nodes
-		 * because a withdrawing node has not yet called
-		 * dlm_release_lockspace() to free its journal lock.  With
-		 * this, all nodes should repeatedly try to recover the
-		 * journal of the withdrawn node until the withdrawing node
-		 * clears its dlm locks, and gfs on each of the remaining nodes
-		 * succeeds in doing the recovery.
-		 */
-
-		if (memb->withdrawing) {
-			log_group(mg, "recovery_done jid %d nodeid %d retry "
-				  "for withdraw", memb->jid, memb->nodeid);
-			memb->tell_gfs_to_recover = 1;
-			memb->wait_gfs_recover_done = 0;
-			usleep(500000);
-		}
-
-		memb->local_recovery_status = RS_GAVEUP;
-		ss = "gaveup";
-		break;
-	case LM_RD_SUCCESS:
-		memb->local_recovery_status = RS_SUCCESS;
-		ss = "success";
-		break;
-	default:
-		/* local_recovery_status is left as it was */
-		log_error("recovery_done: jid %d nodeid %d unknown status %d",
-			  memb->jid, memb->nodeid, status);
-		ss = "unknown";
-	}
-
-	log_group(mg, "recovery_done jid %d nodeid %d %s",
-		  memb->jid, memb->nodeid, ss);
-
-	/* sanity check */
-	if (need_kernel_recovery_done(mg))
-		log_error("recovery_done: should be no pending gfs recoveries");
-
- out:
-	recover_journals(mg);
-	return 0;
-}
-
-/* Handle a remount request from client ci for the fs mounted on dir.  A
-   mode starting with "ro" asks for readonly, anything else for rw.
-   Returns -1 when no mountgroup has dir as a mountpoint, 1 when the
-   mountgroup is already in the requested mode, and 0 after recording ci
-   as remount_client and calling send_remount(). */
-
-int do_remount(int ci, char *dir, char *mode)
-{
-	int want_ro = !strncmp(mode, "ro", 2);
-	struct mountgroup *mg = find_mg_dir(dir);
-
-	if (!mg) {
-		log_error("do_remount: remount mount dir %s", dir);
-		return -1;
-	}
-
-	/* no change */
-	if (want_ro ? mg->readonly : mg->rw)
-		return 1;
-
-	mg->remount_client = ci;
-	send_remount(mg, want_ro);
-	return 0;
-}
-
-/* Handle an unmount of dir; mnterr is non-zero when the unmount follows a
-   failed kernel mount.
-
-   A dir that belongs to an entry on withdrawn_mounts is just released
-   (the mountgroup itself is freed with its last mountpoint).  Otherwise
-   the mountpoint is removed from its mountgroup and, when it was the last
-   one (or when mnterr is set), new mounts are rejected and the group is
-   left with group_leave().
-
-   Returns 0 on success and -1 when dir is unknown or the fs is
-   withdrawing or still mounting. */
-
-int do_unmount(int ci, char *dir, int mnterr)
-{
-	struct mountgroup *mg;
-	struct mountpoint *mp;
-
-	list_for_each_entry(mg, &withdrawn_mounts, list) {
-		mp = find_mountpoint(mg, dir);
-		if (!mp)
-			continue;
-		log_group(mg, "unmount %s for withdrawn fs", dir);
-		list_del(&mp->list);
-		free(mp);
-		if (list_empty(&mg->mountpoints)) {
-			list_del(&mg->list);
-			free(mg);
-		}
-		return 0;
-	}
-
-	mg = find_mg_dir(dir);
-	if (!mg) {
-		log_error("do_unmount: unknown mount dir %s", dir);
-		return -1;
-	}
-
-	if (mnterr) {
-		log_group(mg, "do_unmount: kernel mount error %d", mnterr);
-
-		/* sanity check: we should already have gotten the error from
-		   the mount_result message sent by mount.gfs */
-		if (!mg->kernel_mount_error) {
-			log_group(mg, "do_unmount: mount_error is new %d %d",
-				  mg->kernel_mount_error, mnterr);
-			mg->kernel_mount_error = mnterr;
-			mg->kernel_mount_done = 1;
-		}
-		goto out;	/* the mountpoint entry is not removed on this path */
-	}
-
-	if (mg->withdraw) {
-		log_error("%s do_unmount: fs on %s is withdrawing",
-			  mg->name, dir);
-		return -1;
-	}
-
-	if (!mg->kernel_mount_done) {
-		log_error("%s do_unmount: fs on %s is still mounting",
-			  mg->name, dir);
-		return -1;
-	}
-
-	mp = find_mountpoint(mg, dir);
-	ASSERT(mp);
-	list_del(&mp->list);
-	free(mp);
-
-	if (!list_empty(&mg->mountpoints)) {
-		log_group(mg, "removed mountpoint %s, more remaining", dir);
-		return 0;
-	}
-
-	/* Check to see if we're waiting for a kernel recovery_done to do a
-	   start_done().  If so, call the start_done() here because we won't be
-	   getting anything else from gfs-kernel which is now gone. */
-
-	if (need_kernel_recovery_done(mg)) {
-		log_group(mg, "do_unmount: fill in start_done");
-		start_done(mg);
-	}
-
- out:
-	mg->reject_mounts = 1;
-	group_leave(gh, mg->name);
-	return 0;
-}
-
-#define LOCKSPACE_NODIR "/cluster/dlm/lockspace[@name=\"%s\"]/@nodir"
-
-void notify_mount_client(struct mountgroup *mg)
-{
-	char buf[MAXLINE], path[PATH_MAX], *str, tmp[MAXLINE];
-	int cd, rv, error = 0;
-	struct mg_member *memb;
-
-	memb = find_memb_nodeid(mg, our_nodeid);
-
-	memset(buf, 0, MAXLINE);
-	memset(tmp, 0, MAXLINE);
-
-	if (mg->error_msg[0]) {
-		strncpy(buf, mg->error_msg, MAXLINE);
-		error = 1;
-	} else {
-		if (mg->mount_client_delay) {
-			log_group(mg, "notify_mount_client delayed");
-			return;
-		}
-
-		if (mg->our_jid < 0)
-			snprintf(buf, MAXLINE, "hostdata=id=%u:first=%d",
-		 		 mg->id, mg->first_mounter);
-		else
-			snprintf(buf, MAXLINE, "hostdata=jid=%d:id=%u:first=%d",
-		 		 mg->our_jid, mg->id, mg->first_mounter);
-
-		if ((cd = ccs_connect()) < 0) {
-			log_error("notify_mount_client: ccs_connect failed");
-		}
-
-		memset(path, 0, PATH_MAX);
-		sprintf(path, LOCKSPACE_NODIR, mg->name);
-
-		rv = ccs_get(cd, path, &str);
-		if (rv || !str) {
-			log_debug("notify_mount_client: nodir not found for "
-				  "lockspace %s", mg->name);
-		} else {
-			snprintf(tmp, MAXLINE, ":nodir=%d", atoi(str));
-			strcat(buf, tmp);
-			free(str);
-		}
-
-		if (cd) {
-			log_debug("notify_mount_client: ccs_disconnect");
-			ccs_disconnect(cd);
-		}
-	}
-
-	log_debug("notify_mount_client: %s", buf);
-
-	rv = client_send(mg->mount_client, buf, MAXLINE);
-	if (rv < 0)
-		log_error("notify_mount_client: send failed %d", rv);
-
-	if (error) {
-		log_group(mg, "leaving due to mount error: %s", mg->error_msg);
-		if (memb->finished)
-			group_leave(gh, mg->name);
-		else {
-			log_group(mg, "delay leave until after join");
-			mg->group_leave_on_finish = 1;
-		}
-	} else {
-		mg->mount_client_notified = 1;
-	}
-}
-
-void ping_kernel_mount(char *table)
-{
-	struct mountgroup *mg;
-	char buf[MAXLINE];
-	char *name = strstr(table, ":") + 1;
-	int rv;
-
-	mg = find_mg(name);
-	if (!mg)
-		return;
-
-	rv = get_sysfs(mg, "id", buf, sizeof(buf));
-
-	log_group(mg, "ping_kernel_mount %d", rv);
-}
-
-/* remove the mountpoint that this client added */
-void remove_failed_mountpoint(struct mountgroup *mg, int ci)
-{
-	struct mountpoint *mp;
-	int found = 0;
-
-	list_for_each_entry(mp, &mg->mountpoints, list) {
-		if (mp->client == ci) {
-			list_del(&mp->list);
-			free(mp);
-			found = 1;
-			break;
-		}
-	}
-	ASSERT(found);
-	ASSERT(!list_empty(&mg->mountpoints));
-}
-
-void got_mount_result(struct mountgroup *mg, int result, int ci, int another)
-{
-	struct mg_member *memb;
-
-	memb = find_memb_nodeid(mg, our_nodeid);
-
-	log_group(mg, "got_mount_result: ci %d result %d another %d "
-		  "first_mounter %d opts %x",
-		  ci, result, another, mg->first_mounter, memb->opts);
-
-	mg->mount_client = 0;
-	mg->mount_client_fd = 0;
-
-	if (another) {
-		if (result)
-			remove_failed_mountpoint(mg, ci);
-		return;
-	}
-
-	mg->kernel_mount_done = 1;
-	mg->kernel_mount_error = result;
-
-	send_mount_status(mg);
-}
-
-/* When mounting a fs, we first join the mountgroup, then tell mount.gfs
-   to procede with the kernel mount.  Once we're in the mountgroup, we
-   can get a stop callback at any time, which requires us to block the
-   fs by setting a sysfs file.  If the kernel mount is slow, we can get
-   a stop callback and try to set the sysfs file before the kernel mount
-   has actually created the sysfs files for the fs.  This function delays
-   any further processing until the sysfs files exist. */
-
-/* This function returns 0 when the kernel mount is successfully detected
-   and we know that do_stop() will be able to block the fs.
-   This function returns a negative error if it detects the kernel mount
-   has failed which means there's nothing to stop and do_stop() can assume
-   an implicit stop. */
-
-int wait_for_kernel_mount(struct mountgroup *mg)
-{
-	char buf[MAXLINE], cmd[32], dir[PATH_MAX], type[32];
-	int rv, result;
-
-	while (1) {
-		rv = get_sysfs(mg, "id", buf, sizeof(buf));
-		if (!rv)
-			break;
-		usleep(100000);
-
-		/* If mount.gfs reports an error from mount(2), it means
-		   the mount failed and we don't need to block gfs (and
-		   we're not going to get any sysfs files).
-		   If mount.gfs reports successful mount(2), it means
-		   we need to wait for sysfs files to appear so we can
-		   block gfs for the stop */
-
-		if (mg->kernel_mount_done)
-			continue;
-
-		memset(buf, 0, sizeof(buf));
-
-		rv = read(mg->mount_client_fd, buf, sizeof(buf));
-		if (rv > 0) {
-			log_group(mg, "wait_for_kernel_mount: %s", buf);
-
-			memset(cmd, 0, sizeof(cmd));
-			memset(dir, 0, sizeof(dir));
-			memset(type, 0, sizeof(type));
-
-			rv = sscanf(buf, "%s %s %s %d",
-				    cmd, dir, type, &result);
-			if (rv < 4) {
-				log_error("bad mount_result args %d \"%s\"",
-					  rv, buf);
-				continue;
-			}
-
-			if (strncmp(cmd, "mount_result", 12)) {
-				log_error("bad mount_result \"%s\"", buf);
-				continue;
-			}
-
-			log_group(mg, "mount_result: kernel_mount_error %d",
-				  result);
-
-			mg->kernel_mount_done = 1;
-			mg->kernel_mount_error = result;
-
-			send_mount_status(mg);
-
-			if (result < 0) {
-				rv = result;
-				break;
-			}
-
-			/* if result is 0 then the mount(2) was successful
-			   and we expect to successfully get the "id"
-			   sysfs file very shortly */
-		}
-	}
-
-	return rv;
-}
-
-/* The processing of new mounters (send/recv options, send/recv journals,
-   notify mount.gfs) is not very integrated with the stop/start/finish
-   callbacks from libgroup.  A start callback just notifies us of a new
-   mounter and the options/journals messages drive things from there.
-   Recovery for failed nodes _is_ controlled more directly by the
-   stop/start/finish callbacks.  So, processing new mounters happens
-   independently of recovery and of the libgroup callbacks.  One place
-   where they need to intersect, though, is in stopping/suspending
-   gfs-kernel:
-   - When we get a stop callback, we need to be certain that gfs-kernel
-     is blocked.
-   - When a mounter notifies mount.gfs to go ahead, gfs-kernel will
-     shortly begin running in an unblocked fashion as it goes through
-     the kernel mounting process.
-   Given this, we need to be sure that if gfs-kernel is supposed to be
-   blocked, we don't notify mount.gfs to go ahead and do the kernel mount
-   since that starts gfs-kernel in an unblocked state. */
-
-/* - if we're unmounting, the kernel is gone, so no problem.
-   - if we've just mounted and notified mount.gfs, then wait for kernel
-     mount and then block.
-   - if we're mounting and have not yet notified mount.gfs, then set
-     a flag that delays the notification until block is set to 0. */
-
-int do_stop(struct mountgroup *mg)
-{
-	int rv;
-
-	if (mg->first_mounter && !mg->kernel_mount_done) {
-		log_group(mg, "do_stop skip during first mount recovery");
-		goto out;
-	}
-
-	for (;;) {
-		rv = set_sysfs(mg, "block", 1);
-		if (!rv)
-			break;
-
-		/* We get an error trying to block gfs, this could be due
-		   to a number of things:
-		   1. if the kernel instance of gfs existed before but now
-		      we can't see it, that must mean it's been unmounted,
-		      so it's implicitly stopped
-		   2. we're in the process of mounting and gfs hasn't created
-		      the sysfs files for this fs yet
-		   3. we're mounting and mount(2) returned an error
-		   4. we're mounting but haven't told mount.gfs to go ahead
-		      with mount(2) yet
-		   We also need to handle the situation where we get here in
-		   case 2 but it turns into case 3 while we're in
-		   wait_for_kernel_mount() */
-		   
-		if (mg->got_kernel_mount) {
-			log_group(mg, "do_stop skipped fs unmounted");
-			break;
-		}
-
-		if (mg->mount_client_notified) {
-			if (!mg->kernel_mount_error) {
-				log_group(mg, "do_stop wait for kernel mount");
-				rv = wait_for_kernel_mount(mg);
-				if (rv < 0)
-					break;
-			} else {
-				log_group(mg, "do_stop ignore, failed mount");
-				break;
-			}
-		} else {
-			log_group(mg, "do_stop causes mount_client_delay");
-			mg->mount_client_delay = 1;
-			break;
-		}
-	}
- out:
-	group_stop_done(gh, mg->name);
-	return 0;
-}
-
-/*  After a start that initiated a recovery, everyone will go and see if they
-    can do recovery and try if they can.  If a node can't, it does start_done,
-    if it tries and fails, it does start_done, if it tries and succeeds it
-    sends a message and then does start_done once it receives's it back.  So,
-    when we get a finish we know that we have all the results from the recovery
-    cycle and can judge if everything is recovered properly or not.  If so, we
-    can unblock locks (in the finish), if not, we leave them blocked (in the
-    finish).
-
-    If we leave locks blocked in the finish, then they can only be unblocked
-    after someone is able to do the recovery that's needed.  So, leaving locks
-    blocked in a finish because recovery hasn't worked puts us into a special
-    state: the fs needs recovery, none of the current mounters has been able to
-    recover it, all current mounters have locks blocked in gfs, new mounters
-    are allowed, nodes can unmount, new mounters are asked to do first-mounter
-    recovery, if one of them succeeds then we can all clear this special state
-    and unblock locks (the unblock would happen upon recving the success
-    message from the new pseudo-first mounter, not as part of a finish), future
-    finishes would then go back to being able to unblock locks.
-
-    While in this special state, a new node has been added and asked to do
-    first-mounter recovery, other nodes can also be added while the new
-    first-mounter is active.  These other nodes don't notify mount.gfs.
-    They'll receive the result of the first mounter and if it succeeded they'll
-    notify mount.gfs, otherwise one of them will become the next first-mounter
-    and notify mount.gfs. */
-
-int do_finish(struct mountgroup *mg)
-{
-	struct mg_member *memb, *safe;
-
-	log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
-		  mg->needs_recovery);
-
-	/* members_gone list are the members that were removed from the
-	   members list when processing a start.  members are removed
-	   from members_gone if their journals have been recovered */
-
-	list_for_each_entry_safe(memb, safe, &mg->members_gone, list) {
-		if (!memb->recovery_status) {
-			list_del(&memb->list);
-			free(memb);
-		} else if (memb->recovery_status == RS_SUCCESS) {
-			ASSERT(memb->gone_event <= mg->last_finish);
-			log_group(mg, "finish: recovered jid %d nodeid %d",
-				  memb->jid, memb->nodeid);
-			list_del(&memb->list);
-			free(memb);
-		} else {
-			log_error("%s finish: needs recovery jid %d nodeid %d "
-				  "status %d", mg->name, memb->jid,
-				  memb->nodeid, memb->recovery_status);
-			mg->needs_recovery = 1;
-		}
-	}
-
-	list_for_each_entry(memb, &mg->members, list)
-		memb->finished = 1;
-
-	if (mg->group_leave_on_finish) {
-		log_group(mg, "leaving group after delay for join to finish");
-		group_leave(gh, mg->name);
-		mg->group_leave_on_finish = 0;
-		return 0;
-	}
-
-	if (!mg->needs_recovery) {
-		set_sysfs(mg, "block", 0);
-
-		/* we may have been holding back our local mount due to
-		   being stopped/blocked */
-		if (mg->mount_client_delay && !first_mounter_recovery(mg)) {
-			mg->mount_client_delay = 0;
-			notify_mount_client(mg);
-		}
-	} else
-		log_group(mg, "finish: leave locks blocked for needs_recovery");
-
-	return 0;
-}
-
-/*
- * - require the first mounter to be rw, not ro or spectator.
- *
- * - if rw mounter fails, leaving only spectator mounters,
- * require the next mounter to be rw, more ro/spectator mounts should
- * fail until the fs is mounted rw.
- *
- * - if last rw mounter fails and ro mounters are left (possibly with
- * some spectators), disallow any ro->rw remounts, leave gfs blocked,
- * require next mounter to be rw, have next mounter do first mount
- * gfs/journal recovery.
- */
-
-/* called for the initial start on the node that's first to mount the fs.
-   (it should be ok to let the first mounter be a spectator, gfs should do
-   first recovery and bail out if there are any dirty journals) */
-
-/* FIXME: if journal recovery fails on any of the journals, we should
-   fail the mount */
-
-void start_first_mounter(struct mountgroup *mg)
-{
-	struct mg_member *memb;
-
-	log_group(mg, "start_first_mounter");
-	set_our_memb_options(mg);
-	memb = find_memb_nodeid(mg, our_nodeid);
-	ASSERT(memb);
-
-	if (mg->readonly || mg->spectator) {
-		memb->jid = -2;
-		mg->our_jid = -2;
-		log_group(mg, "start_first_mounter not rw ro=%d spect=%d",
-			  mg->readonly, mg->spectator);
-		strcpy(mg->error_msg, "error: first mounter must be read-write");
-	} else {
-		memb->opts |= MEMB_OPT_RECOVER;
-		memb->jid = 0;
-		mg->our_jid = 0;
-		mg->first_mounter = 1;
-		mg->first_mounter_done = 0;
-		mg->got_our_options = 1;
-		mg->got_our_journals = 1;
-	}
-	start_done(mg);
-	notify_mount_client(mg);
-}
-
-/* called for the initial start on a rw/ro mounter;
-   the existing mounters are running start_participant() */
-
-void start_participant_init(struct mountgroup *mg)
-{
-	log_group(mg, "start_participant_init");
-	set_our_memb_options(mg);
-	send_options(mg);
-	start_done(mg);
-}
-
-/* called for a non-initial start on a normal mounter.
-   NB we can get here without having received a journals message for
-   our (recent) mount yet in which case we don't know the jid or ro/rw
-   status of any members, and don't know our own jid. */
-
-void start_participant(struct mountgroup *mg, int pos, int neg)
-{
-	log_group(mg, "start_participant pos=%d neg=%d", pos, neg);
-
-	if (pos) {
-		start_done(mg);
-		/* we save options messages from nodes for whom we've not
-		   received a start yet */
-		process_saved_options(mg);
-	} else if (neg) {
-		recover_journals(mg);
-		process_saved_recovery_status(mg);
-	}
-}
-
-/* called for the initial start on a spectator mounter */
-
-void start_spectator_init(struct mountgroup *mg)
-{
-	log_group(mg, "start_spectator_init");
-	set_our_memb_options(mg);
-	send_options(mg);
-	start_done(mg);
-	mg->start2_fn = start_spectator_init_2;
-}
-
-/* called for the initial start on a spectator mounter,
-   after _receive_journals() */
-
-void start_spectator_init_2(struct mountgroup *mg)
-{
-	log_group(mg, "start_spectator_init_2 our_jid=%d", mg->our_jid);
-
-	/* we've been given jid of -2 which means we're not permitted
-	   to mount the fs; probably because the next mounter must be rw */
-
-	if (mg->our_jid == -2)
-		strcpy(mg->error_msg, "error: spectator mount not allowed");
-	else
-		ASSERT(mg->our_jid == -1);
-
-	notify_mount_client(mg);
-}
-
-/* called for a non-initial start on a spectator mounter */
-
-void start_spectator(struct mountgroup *mg, int pos, int neg)
-{
-	log_group(mg, "start_spectator pos=%d neg=%d", pos, neg);
-
-	if (pos) {
-		start_done(mg);
-		process_saved_options(mg);
-	} else if (neg) {
-		recover_journals(mg);
-		process_saved_recovery_status(mg);
-	}
-}
-
-/* If nodeA fails, nodeB is recovering journalA and nodeB fails before
-   finishing, then nodeC needs to tell gfs to recover both journalA and
-   journalB.  We do this by setting tell_gfs_to_recover back to 1 for
-   any nodes that are still on the members_gone list. */
-
-void reset_unfinished_recoveries(struct mountgroup *mg)
-{
-	struct mg_member *memb;
-
-	list_for_each_entry(memb, &mg->members_gone, list) {
-		if (memb->recovery_status &&
-		    memb->recovery_status != RS_NEED_RECOVERY) {
-			log_group(mg, "retry unfinished recovery "
-				  "jid %d nodeid %d",
-				  memb->jid, memb->nodeid);
-			memb->tell_gfs_to_recover = 1;
-			memb->recovery_status = RS_NEED_RECOVERY;
-			memb->local_recovery_status = RS_NEED_RECOVERY;
-		}
-	}
-}
-
-/*
-   old method:
-   A is rw mount, B mounts rw
-
-   do_start		do_start
-   start_participant	start_participant_init
-   			send_options
-   receive_options
-   start_participant_2
-   discover_journals
-   assign B a jid
-   send_journals
-   group_start_done
-   			receive_journals
-			start_participant_init_2
-			group_start_done
-   do_finish		do_finish
-
-   new method: decouples stop/start/finish from mount processing
-   A is rw mount, B mounts rw
-
-   do_start		do_start
-   start_participant	start_participant_init
-   start_done		send_options
-   			start_done
-   do_finish		do_finish
-
-   receive_options
-   assign_journal
-   send_journals
-   			receive_journals
-			start_participant_init_2
-			notify_mount_client
-*/
-
-void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids)
-{
-	int pos = 0, neg = 0;
-
-	mg->start_event_nr = mg->last_start;
-	mg->start_type = type;
-
-	log_group(mg, "start %d init %d type %d member_count %d",
-		  mg->last_start, mg->init, type, member_count);
-
-	recover_members(mg, member_count, nodeids, &pos, &neg);
-	reset_unfinished_recoveries(mg);
-
-	if (mg->init) {
-		if (member_count == 1)
-			start_first_mounter(mg);
-		else if (mg->spectator)
-			start_spectator_init(mg);
-		else
-			start_participant_init(mg);
-		mg->init = 0;
-	} else {
-		if (mg->spectator)
-			start_spectator(mg, pos, neg);
-		else
-			start_participant(mg, pos, neg);
-	}
-}
-
-/*
-  What repurcussions are there from umount shutting down gfs in the
-  kernel before we leave the mountgroup?  We can no longer participate
-  in recovery even though we're in the group -- what are the end cases
-  that we need to deal with where this causes a problem?  i.e. there
-  is a period of time where the mountgroup=A,B,C but the kernel fs
-  is only active on A,B, not C.  The mountgroup on A,B can't depend
-  on the mg on C to necessarily be able to do some things (recovery).
-
-  At least in part, it means that after we do an umount and have
-  removed the instance of this fs in the kernel, we'll still get
-  stop/start/finish callbacks from groupd for which we'll attempt
-  and fail to: block/unblock gfs kernel activity, initiate gfs
-  journal recovery, get recovery-done signals fromt eh kernel.
-  
-  We don't want to hang groupd event processing by failing to send
-  an ack (stop_done/start_done) back to groupd when it needs one
-  to procede.  In the case where we get a start for a failed node
-  that needs journal recovery, we have a problem because we wait to
-  call group_start_done() until gfs in the kernel to signal that
-  the journal recovery is done.  If we've unmounted gfs isn't there
-  any more to give us this signal and we'll never call start_done.
- 
-  update: we should be dealing with all these issues correctly now. */
-
-int do_terminate(struct mountgroup *mg)
-{
-	purge_plocks(mg, 0, 1);
-
-	if (mg->withdraw) {
-		log_group(mg, "termination of our withdraw leave");
-		set_sysfs(mg, "withdraw", 1);
-		list_move(&mg->list, &withdrawn_mounts);
-	} else {
-		log_group(mg, "termination of our unmount leave");
-		list_del(&mg->list);
-		free(mg);
-	}
-
-	return 0;
-}
-
-static int run_dmsetup_suspend(struct mountgroup *mg, char *dev)
-{
-	struct sched_param sched_param;
-	char buf[PATH_MAX];
-	pid_t pid;
-	int i, rv;
-
-	memset(buf, 0, sizeof(buf));
-	rv = readlink(dev, buf, PATH_MAX);
-	if (rv < 0)
-		strncpy(buf, dev, sizeof(buf));
-
-	log_group(mg, "run_dmsetup_suspend %s (orig %s)", buf, dev);
-
-	pid = fork();
-	if (pid < 0)
-		return -1;
-
-	if (pid) {
-		mg->dmsetup_wait = 1;
-		mg->dmsetup_pid = pid;
-		return 0;
-	} else {
-		sched_param.sched_priority = 0; 
-		sched_setscheduler(0, SCHED_OTHER, &sched_param);
-
-		for (i = 0; i < 50; i++)
-			close(i);
-	
-		execlp("dmsetup", "dmsetup", "suspend", buf, NULL);
-		exit(EXIT_FAILURE);
-	}
-	return -1;
-}
-
-/* The basic rule of withdraw is that we don't want to tell the kernel to drop
-   all locks until we know gfs has been stopped/blocked on all nodes.  They'll
-   be stopped for our leave, we just need to know when they've all arrived
-   there.
-
-   A withdrawing node is very much like a readonly node, differences are
-   that others recover its journal when they remove it from the group,
-   and when it's been removed from the group (gets terminate for its leave),
-   it tells the locally withdrawing gfs to clear out locks. */
-
-int do_withdraw(char *table)
-{
-	struct mountgroup *mg;
-	char *name = strstr(table, ":") + 1;
-	int rv;
-
-	if (config_no_withdraw) {
-		log_error("withdraw feature not enabled");
-		return 0;
-	}
-
-	mg = find_mg(name);
-	if (!mg) {
-		log_error("do_withdraw no mountgroup %s", name);
-		return -1;
-	}
-
-	rv = run_dmsetup_suspend(mg, mg->dev);
-	if (rv) {
-		log_error("do_withdraw %s: dmsetup %s error %d", mg->name,
-			  mg->dev, rv);
-		return -1;
-	}
-
-	dmsetup_wait = 1;
-	return 0;
-}
-
-void dmsetup_suspend_done(struct mountgroup *mg, int rv)
-{
-	log_group(mg, "dmsetup_suspend_done result %d", rv);
-	mg->dmsetup_wait = 0;
-	mg->dmsetup_pid = 0;
-
-	if (!rv) {
-		mg->withdraw = 1;
-		send_withdraw(mg);
-	}
-}
-
-void update_dmsetup_wait(void)
-{
-	struct mountgroup *mg;
-	int status;
-	int waiting = 0;
-	pid_t pid;
-
-	list_for_each_entry(mg, &mounts, list) {
-		if (mg->dmsetup_wait) {
-			pid = waitpid(mg->dmsetup_pid, &status, WNOHANG);
-
-			/* process not exited yet */
-			if (!pid) {
-				waiting++;
-				continue;
-			}
-
-			if (pid < 0) {
-				log_error("update_dmsetup_wait %s: waitpid %d "
-					  "error %d", mg->name,
-					  mg->dmsetup_pid, errno);
-				dmsetup_suspend_done(mg, -2);
-				continue;
-			}
-
-			/* process exited */
-
-			if (!WIFEXITED(status) || WEXITSTATUS(status))
-				dmsetup_suspend_done(mg, -1);
-			else
-				dmsetup_suspend_done(mg, 0);
-		}
-	}
-
-	if (!waiting) {
-		dmsetup_wait = 0;
-		log_debug("dmsetup_wait off");
-	}
-}
-
diff --git a/group/gfs_controld/util.c b/group/gfs_controld/util.c
new file mode 100644
index 0000000..70405bb
--- /dev/null
+++ b/group/gfs_controld/util.c
@@ -0,0 +1,197 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "gfs_daemon.h"
+#include "libfenced.h"
+
+int we_are_in_fence_domain(void)
+{
+	struct fenced_node info;
+	int error;
+
+	/* ask fenced about our own node, starting from a zeroed record */
+	memset(&info, 0, sizeof(info));
+	error = fenced_node_info(our_nodeid, &info);
+	if (error < 0) {
+		/* a failed query is reported as "not a member" */
+		log_debug("fenced_node_info error %d", error);
+		return 0;
+	}
+
+	/* collapse the member field to a strict 0/1 answer */
+	return info.member ? 1 : 0;
+}
+
+#define SYSFS_DIR       "/sys/fs"
+#define SYSFS_BUFLEN    64
+
+/* Write val to the fs's lock_module/<field> sysfs file; returns 0 or -1. */
+int set_sysfs(struct mountgroup *mg, char *field, int val)
+{
+	char fname[PATH_MAX], out[SYSFS_BUFLEN];
+	int rv, fd;
+
+	snprintf(fname, PATH_MAX, "%s/%s/%s/lock_module/%s",
+		 SYSFS_DIR, mg->mount_args.type, mg->mount_args.table, field);
+
+	log_group(mg, "set %s to %d", fname, val);
+
+	fd = open(fname, O_RDWR);
+	if (fd < 0) {
+		log_group(mg, "set open %s error %d %d", fname, fd, errno);
+		return -1;
+	}
+
+	mg->got_kernel_mount = 1;	/* the open worked, so the file exists */
+
+	snprintf(out, sizeof(out), "%d", val);
+	rv = write(fd, out, strlen(out));
+	if (rv < 0)	/* report the failure instead of claiming success */
+		log_error("set write %s error %d %d", fname, rv, errno);
+
+	close(fd);
+	return rv < 0 ? -1 : 0;
+}
+
+/* Read lock_module/<field> into buf as a NUL-terminated line; 0 or -1. */
+static int get_sysfs(struct mountgroup *mg, char *field, char *buf, int len)
+{
+	char fname[PATH_MAX];
+	int fd, rv;
+
+	snprintf(fname, PATH_MAX, "%s/%s/%s/lock_module/%s",
+		 SYSFS_DIR, mg->mount_args.type, mg->mount_args.table, field);
+
+	fd = open(fname, O_RDONLY);
+	if (fd < 0) {
+		log_group(mg, "get open %s error %d %d", fname, fd, errno);
+		return -1;
+	}
+
+	mg->got_kernel_mount = 1;
+	/* read() adds no terminator, so keep one byte back for it */
+	rv = read(fd, buf, len - 1);
+	if (rv < 0)
+		log_error("read %s error %d %d", fname, rv, errno);
+	else {
+		buf[rv] = '\0';
+		buf[strcspn(buf, "\n")] = '\0';	/* cut at the first newline */
+		rv = 0;
+	}
+
+	close(fd);
+	return rv;
+}
+
+/* Fetch sysfs <field> and parse it with atoi() into *val_out; 0 or <0. */
+int read_sysfs_int(struct mountgroup *mg, char *field, int *val_out)
+{
+	char text[SYSFS_BUFLEN] = { 0 };
+	int error;
+
+	error = get_sysfs(mg, field, text, sizeof(text));
+	if (error < 0)
+		return error;	/* *val_out is left untouched */
+
+	/* atoi() turns non-numeric content into 0 without any error */
+	*val_out = atoi(text);
+	return 0;
+}
+
+/* Fork "dmsetup suspend <dev>"; 0 once the child is running, else -1. */
+int run_dmsetup_suspend(struct mountgroup *mg, char *dev)
+{
+	struct sched_param sched_param;
+	char buf[PATH_MAX];
+	pid_t pid;
+	int i, rv;
+
+	/* keep the final byte 0: readlink()/strncpy() do not terminate */
+	memset(buf, 0, sizeof(buf));
+	rv = readlink(dev, buf, sizeof(buf) - 1);
+	if (rv < 0)
+		strncpy(buf, dev, sizeof(buf) - 1);
+
+	log_group(mg, "run_dmsetup_suspend %s (orig %s)", buf, dev);
+
+	pid = fork();
+	if (pid < 0)
+		return -1;
+	if (pid) {
+		/* parent: update_dmsetup_wait() reaps the child later */
+		mg->dmsetup_wait = 1;
+		mg->dmsetup_pid = pid;
+		return 0;
+	}
+
+	sched_param.sched_priority = 0;
+	sched_setscheduler(0, SCHED_OTHER, &sched_param);
+	for (i = 0; i < 50; i++)
+		close(i);
+
+	execlp("dmsetup", "dmsetup", "suspend", buf, (char *) NULL);
+	_exit(EXIT_FAILURE);	/* exec failed; skip atexit handlers */
+}
+
+/* Clear the wait state for mg; on success (rv 0) mark it withdrawing. */
+static void dmsetup_suspend_done(struct mountgroup *mg, int rv)
+{
+	log_group(mg, "dmsetup_suspend_done result %d", rv);
+	mg->dmsetup_pid = 0;
+	mg->dmsetup_wait = 0;
+	if (rv)
+		return;
+	mg->withdraw = 1;
+	if (mg->old_group_mode)
+		send_withdraw_old(mg);
+}
+
+/* Reap finished dmsetup children; clear dmsetup_wait when none remain. */
+void update_dmsetup_wait(void)
+{
+	struct mountgroup *mg;
+	int status, failed;
+	int waiting = 0;
+	pid_t pid;
+
+	list_for_each_entry(mg, &mountgroups, list) {
+		if (!mg->dmsetup_wait)
+			continue;
+
+		pid = waitpid(mg->dmsetup_pid, &status, WNOHANG);
+
+		/* process not exited yet */
+		if (!pid) {
+			waiting++;
+			continue;
+		}
+
+		if (pid < 0) {
+			log_error("update_dmsetup_wait %s: waitpid %d "
+				  "error %d", mg->name,
+				  mg->dmsetup_pid, errno);
+			dmsetup_suspend_done(mg, -2);
+			continue;
+		}
+
+		/* process exited: anything but a clean exit(0) is a failure */
+		failed = !WIFEXITED(status) || WEXITSTATUS(status);
+		dmsetup_suspend_done(mg, failed ? -1 : 0);
+	}
+
+	if (waiting)
+		return;
+
+	dmsetup_wait = 0;
+	log_debug("dmsetup_wait off");
+}
+
diff --git a/group/libgfscontrol/Makefile b/group/libgfscontrol/Makefile
new file mode 100644
index 0000000..40d12d6
--- /dev/null
+++ b/group/libgfscontrol/Makefile
@@ -0,0 +1,53 @@
+###############################################################################
+###############################################################################
+##
+##  Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+##  
+##  This copyrighted material is made available to anyone wishing to use,
+##  modify, copy, or redistribute it subject to the terms and conditions
+##  of the GNU General Public License v.2.
+##
+###############################################################################
+###############################################################################
+
+TARGET= libgfscontrol
+
+LIBDIRT=$(TARGET).a \
+	$(TARGET).so.$(SOMAJOR).$(SOMINOR)
+
+LIBSYMT=$(TARGET).so \
+	$(TARGET).so.$(SOMAJOR)
+
+INCDIRT=$(TARGET).h
+
+include ../../make/defines.mk
+
+SHAREDLIB=$(TARGET).so.${SOMAJOR}.${SOMINOR}
+STATICLIB=$(TARGET).a
+
+all: $(STATICLIB) $(SHAREDLIB)
+
+include $(OBJDIR)/make/cobj.mk
+include $(OBJDIR)/make/clean.mk
+include $(OBJDIR)/make/install.mk
+include $(OBJDIR)/make/uninstall.mk
+
+OBJS=	main.o
+
+CFLAGS += -fPIC
+CFLAGS += -I$(S)/../../group/gfs_controld
+CFLAGS += -I${incdir}
+CFLAGS += -I$(KERNEL_SRC)/include
+
+$(TARGET).a: $(OBJS)
+	${AR} r $@ $^
+	${RANLIB} $@
+# link every prerequisite ($^), matching the static library rule above
+$(TARGET).so.${SOMAJOR}.${SOMINOR}: $(OBJS)
+	$(CC) -shared -o $@ -Wl,-soname=$(TARGET).so.$(SOMAJOR) $^ $(LDFLAGS)
+	ln -sf $(TARGET).so.$(SOMAJOR).$(SOMINOR) $(TARGET).so
+	ln -sf $(TARGET).so.$(SOMAJOR).$(SOMINOR) $(TARGET).so.$(SOMAJOR)
+
+clean: generalclean
+
+-include $(OBJS:.o=.d)
diff --git a/group/libgfscontrol/libgfscontrol.h b/group/libgfscontrol/libgfscontrol.h
new file mode 100644
index 0000000..3a151ba
--- /dev/null
+++ b/group/libgfscontrol/libgfscontrol.h
@@ -0,0 +1,131 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public
+**  License as published by the Free Software Foundation; either
+**  version 2 of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free Software
+**  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef _LIBGFSCONTROL_H_
+#define _LIBGFSCONTROL_H_
+
+/* Maximum mountgroup name length, should match DLM_LOCKSPACE_LEN from
+   linux/dlmconstants.h.  The libcpg limit is larger at
+   CPG_MAX_NAME_LENGTH 128.  Our cpg name includes a "gfs:" prefix before
+   the mountgroup name. */
+
+#define GFS_MOUNTGROUP_LEN	64
+
+#define GFSC_DUMP_SIZE		(1024 * 1024)
+
+#define GFSC_NF_MEMBER		0x00000001 /* node is member in cg */
+#define GFSC_NF_START		0x00000002 /* start message recvd for cg */
+#define GFSC_NF_DISALLOWED	0x00000004 /* node disallowed in cg */
+
+struct gfsc_node {
+	int nodeid;
+	uint32_t flags;		/* presumably GFSC_NF_* bits -- TODO confirm */
+	uint32_t added_seq;
+	uint32_t removed_seq;
+	int failed_reason;
+};
+
+struct gfsc_change {
+	int member_count;
+	int joined_count;
+	int remove_count;
+	int failed_count;
+	int wait_condition;	/* 0 no, 1 fencing, 2 quorum, 3 fs */
+	int wait_messages;	/* 0 no, 1 yes */
+	uint32_t seq;
+	uint32_t combined_seq;
+};
+
+#define GFSC_LF_JOINING		0x00000001
+#define GFSC_LF_LEAVING		0x00000002
+#define GFSC_LF_KERNEL_STOPPED	0x00000004
+
+struct gfsc_mountgroup {
+	struct gfsc_change cg_prev;	/* completed change (started_change) */
+	struct gfsc_change cg_next;	/* in-progress change (changes list) */
+	uint32_t flags;			/* presumably GFSC_LF_* bits -- TODO confirm */
+	uint32_t global_id;
+	char name[GFS_MOUNTGROUP_LEN+1]; /* max name length plus one byte */
+};
+
+/* gfsc_mountgroup_nodes() types
+
+   MEMBERS: members in completed (prev) change,
+            zero if there's no completed (prev) change
+   NEXT:    members in in-progress (next) change,
+            zero if there's no in-progress (next) change
+   ALL:     NEXT + nonmembers if there's an in-progress (next) change,
+            MEMBERS + nonmembers if there's no in-progress (next) change, but
+            there is a completed (prev) change
+            nonmembers if there's no in-progress (next) or completed (prev)
+            change (possible?)
+
+   gfsc_node_info() returns info for in-progress (next) change, if one exists,
+   otherwise it returns info for completed (prev) change.
+*/
+
+#define GFSC_NODES_ALL		1
+#define GFSC_NODES_MEMBERS	2
+#define GFSC_NODES_NEXT		3
+
+int gfsc_dump_debug(char *buf);
+int gfsc_dump_plocks(char *name, char *buf);
+int gfsc_mountgroup_info(char *mgname, struct gfsc_mountgroup *mg);
+int gfsc_node_info(char *mgname, int nodeid, struct gfsc_node *node);
+int gfsc_mountgroups(int max, int *count, struct gfsc_mountgroup *mgs);
+int gfsc_mountgroup_nodes(char *mgname, int type, int max, int *count,
+			 struct gfsc_node *nodes);
+
+struct gfsc_mount_args {
+	char dir[PATH_MAX];
+	char type[PATH_MAX];	/* presumably the fs type name -- TODO confirm */
+	char proto[PATH_MAX];
+	char table[PATH_MAX];	/* presumably the lock table name -- TODO confirm */
+	char options[PATH_MAX];
+	char dev[PATH_MAX];
+	char hostdata[PATH_MAX];
+};
+
+/*
+ * mount.gfs connects to gfs_controld,
+ * mount.gfs tells gfs_controld to do a join or remount,
+ * mount.gfs reads the result of the join or remount from gfs_controld,
+ * mount.gfs tells gfs_controld the result of the mount(2),
+ * mount.gfs disconnects from gfs_controld
+ */
+
+int gfsc_fs_connect(void);
+int gfsc_fs_join(int fd, struct gfsc_mount_args *ma);
+int gfsc_fs_remount(int fd, struct gfsc_mount_args *ma);
+int gfsc_fs_result(int fd, int *result, struct gfsc_mount_args *ma);
+int gfsc_fs_mount_done(int fd, struct gfsc_mount_args *ma, int result);
+void gfsc_fs_disconnect(int fd);
+
+/*
+ * mount.gfs tells gfs_controld to do a leave (due to a mount failure)
+ * for unmount, gfs_controld leaves due to a message from the kernel
+ */
+
+int gfsc_fs_leave(struct gfsc_mount_args *ma, int reason);
+
+#endif
+
diff --git a/group/libgfscontrol/main.c b/group/libgfscontrol/main.c
new file mode 100644
index 0000000..35f6f43
--- /dev/null
+++ b/group/libgfscontrol/main.c
@@ -0,0 +1,437 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "libgfscontrol.h"
+#include "gfs_controld.h"
+
+/* Read exactly count bytes from fd into buf; returns 0, or -1 on error/EOF. */
+static int do_read(int fd, void *buf, size_t count)
+{
+	char *p = buf;		/* void * arithmetic is a GNU extension */
+	size_t off = 0;		/* size_t: no signed/unsigned compare with count */
+
+	while (off < count) {
+		ssize_t rv = read(fd, p + off, count - off);
+		if (rv == -1 && errno == EINTR)
+			continue;	/* interrupted before any data: retry */
+		if (rv <= 0)
+			return -1;	/* hard error, or EOF before count bytes */
+		off += rv;
+	}
+	return 0;
+}
+
+/* Write all count bytes of buf to fd, resuming after short writes and EINTR;
+   returns 0 on success, or -1 if write() fails. */
+static int do_write(int fd, void *buf, size_t count)
+{
+	const char *p = buf;	/* void * arithmetic is a GNU extension */
+	size_t off = 0;		/* bytes already written */
+
+	while (off < count) {
+		ssize_t rv = write(fd, p + off, count - off);
+
+		if (rv == -1 && errno == EINTR)
+			continue;	/* interrupted before any data: retry */
+		if (rv < 0)
+			return -1;
+		/* short write: advance and send the remainder */
+		off += rv;
+	}
+	return 0;
+}
+
+/* Connect to the abstract unix socket named sock_path; returns fd or -1. */
+static int do_connect(char *sock_path)
+{
+	struct sockaddr_un sun;
+	size_t len = strlen(sock_path);
+	int fd;
+
+	/* sun_path holds a leading '\0' (abstract socket), the name and a NUL */
+	if (len > sizeof(sun.sun_path) - 2)
+		return -1;
+	fd = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (fd < 0)
+		return fd;
+	memset(&sun, 0, sizeof(sun));
+	sun.sun_family = AF_UNIX;
+	memcpy(&sun.sun_path[1], sock_path, len);
+	len += sizeof(sa_family_t) + 1;	/* family + leading '\0' + name */
+	if (connect(fd, (struct sockaddr *) &sun, len) < 0) {
+		close(fd);
+		fd = -1;
+	}
+	return fd;
+}
+
+/* Zero *h, then fill in magic, version, command, total length and name. */
+static void init_header(struct gfsc_header *h, int cmd, char *name,
+			int extra_len)
+{
+	memset(h, 0, sizeof(*h));
+	h->command = cmd;
+	h->len = sizeof(*h) + extra_len;	/* header plus trailing payload */
+	h->version = GFSC_VERSION;
+	h->magic = GFSC_MAGIC;
+	/* a NULL name leaves h->name zeroed */
+	if (name != NULL)
+		strncpy(h->name, name, GFS_MOUNTGROUP_LEN);
+}
+
+/* Send dump request cmd (with mountgroup name, if any) to gfs_controld and
+   copy GFSC_DUMP_SIZE bytes of reply data into buf; returns < 0 on error. */
+int do_dump(int cmd, char *name, char *buf)
+{
+	struct gfsc_header h, *rh;
+	char *reply;
+	int reply_len;
+	int fd, rv;
+
+	init_header(&h, cmd, name, 0);
+
+	reply_len = sizeof(struct gfsc_header) + GFSC_DUMP_SIZE;
+	reply = calloc(1, reply_len);	/* zeroed, as a short reply is accepted */
+	if (!reply)
+		return -1;
+
+	fd = do_connect(GFSC_QUERY_SOCK_PATH);
+	if (fd < 0) {
+		rv = fd;
+		goto out_free;
+	}
+
+	rv = do_write(fd, &h, sizeof(h));
+	if (rv < 0)
+		goto out_close;
+
+	/* won't always get back the full reply_len */
+	do_read(fd, reply, reply_len);
+
+	rh = (struct gfsc_header *)reply;
+	rv = rh->data;
+	if (rv < 0)
+		goto out_close;
+
+	memcpy(buf, (char *)reply + sizeof(struct gfsc_header),
+	       GFSC_DUMP_SIZE);
+ out_close:
+	close(fd);
+ out_free:
+	free(reply);	/* the staging buffer must not outlive the call */
+	return rv;
+}
+
+int gfsc_dump_debug(char *buf)
+{	/* sends no name; on success buf receives GFSC_DUMP_SIZE bytes */
+	return do_dump(GFSC_CMD_DUMP_DEBUG, NULL, buf);
+}
+
+int gfsc_dump_plocks(char *name, char *buf)
+{	/* name is sent in the header; on success buf gets GFSC_DUMP_SIZE bytes */
+	return do_dump(GFSC_CMD_DUMP_PLOCKS, name, buf);
+}
+
+/*
+ * Ask gfs_controld about node nodeid in mountgroup name.  On success the
+ * reply payload is copied into *node and the reply header's data field is
+ * returned; a negative value means the query or the exchange failed.
+ */
+int gfsc_node_info(char *name, int nodeid, struct gfsc_node *node)
+{
+	char reply[sizeof(struct gfsc_header) + sizeof(struct gfsc_node)];
+	struct gfsc_header *reply_hdr = (struct gfsc_header *)reply;
+	struct gfsc_header req;
+	int sock, result;
+
+	init_header(&req, GFSC_CMD_NODE_INFO, name, 0);
+	req.data = nodeid;
+	memset(reply, 0, sizeof(reply));
+
+	sock = do_connect(GFSC_QUERY_SOCK_PATH);
+	if (sock < 0)
+		return sock;
+
+	result = do_write(sock, &req, sizeof(req));
+	if (result < 0)
+		goto done;
+
+	/* the whole fixed-size reply must arrive */
+	result = do_read(sock, reply, sizeof(reply));
+	if (result < 0)
+		goto done;
+
+	result = reply_hdr->data;
+	if (result >= 0)
+		memcpy(node, reply + sizeof(struct gfsc_header),
+		       sizeof(struct gfsc_node));
+ done:
+	close(sock);
+	return result;
+}
+
+/*
+ * Ask gfs_controld about mountgroup name.  On success the reply payload is
+ * copied into *mountgroup and the reply header's data field is returned;
+ * a negative value means the query or the exchange failed.
+ */
+int gfsc_mountgroup_info(char *name, struct gfsc_mountgroup *mountgroup)
+{
+	char reply[sizeof(struct gfsc_header) + sizeof(struct gfsc_mountgroup)];
+	struct gfsc_header *reply_hdr = (struct gfsc_header *)reply;
+	struct gfsc_header req;
+	int sock, result;
+
+	init_header(&req, GFSC_CMD_MOUNTGROUP_INFO, name, 0);
+	memset(reply, 0, sizeof(reply));
+
+	sock = do_connect(GFSC_QUERY_SOCK_PATH);
+	if (sock < 0)
+		return sock;
+
+	result = do_write(sock, &req, sizeof(req));
+	if (result < 0)
+		goto done;
+
+	/* the whole fixed-size reply must arrive */
+	result = do_read(sock, reply, sizeof(reply));
+	if (result < 0)
+		goto done;
+
+	result = reply_hdr->data;
+	if (result >= 0)
+		memcpy(mountgroup, reply + sizeof(struct gfsc_header),
+		       sizeof(struct gfsc_mountgroup));
+ done:
+	close(sock);
+	return result;
+}
+
+/* Fetch up to max mountgroups into mgs.  *count gets the daemon's count, or
+   -E2BIG (max entries are copied then).  Returns 0, or < 0 on failure, in
+   which case *count and mgs are left untouched. */
+int gfsc_mountgroups(int max, int *count, struct gfsc_mountgroup *mgs)
+{
+	struct gfsc_header h;
+	char *reply;
+	int reply_len;
+	int fd, rv, result, mg_count;
+
+	init_header(&h, GFSC_CMD_MOUNTGROUPS, NULL, 0);
+	h.data = max;
+
+	reply_len = sizeof(struct gfsc_header) +
+		    (max * sizeof(struct gfsc_mountgroup));
+	reply = calloc(1, reply_len);
+	if (!reply)
+		return -1;
+
+	fd = do_connect(GFSC_QUERY_SOCK_PATH);
+	if (fd < 0) {
+		rv = fd;
+		goto out_free;
+	}
+
+	rv = do_write(fd, &h, sizeof(h));
+	if (rv < 0)
+		goto out_close;
+
+	/* won't usually get back the full reply_len */
+	do_read(fd, reply, reply_len);
+
+	result = ((struct gfsc_header *)reply)->data;
+	if (result < 0 && result != -E2BIG) {
+		rv = result;
+		goto out_close;
+	}
+
+	/* a count above max would overrun reply and mgs: handle it like E2BIG */
+	if (result == -E2BIG || result > max) {
+		*count = -E2BIG;
+		mg_count = max;
+	} else {
+		*count = result;
+		mg_count = result;
+	}
+	rv = 0;
+	memcpy(mgs, reply + sizeof(struct gfsc_header),
+	       mg_count * sizeof(struct gfsc_mountgroup));
+ out_close:
+	close(fd);
+ out_free:
+	free(reply);
+	return rv;
+}
+
+/* Fetch up to max nodes of mountgroup name, selected by type (GFSC_NODES_*),
+   into nodes.  *count gets the daemon's count, or -E2BIG (max entries are
+   copied then).  Returns 0, or < 0 on failure with *count/nodes untouched. */
+int gfsc_mountgroup_nodes(char *name, int type, int max, int *count,
+			 struct gfsc_node *nodes)
+{
+	struct gfsc_header h;
+	char *reply;
+	int reply_len;
+	int fd, rv, result, node_count;
+
+	init_header(&h, GFSC_CMD_MOUNTGROUP_NODES, name, 0);
+	h.option = type;
+	h.data = max;
+
+	reply_len = sizeof(struct gfsc_header) +
+		    (max * sizeof(struct gfsc_node));
+	reply = calloc(1, reply_len);
+	if (!reply)
+		return -1;
+
+	fd = do_connect(GFSC_QUERY_SOCK_PATH);
+	if (fd < 0) {
+		rv = fd;
+		goto out_free;
+	}
+
+	rv = do_write(fd, &h, sizeof(h));
+	if (rv < 0)
+		goto out_close;
+
+	/* won't usually get back the full reply_len */
+	do_read(fd, reply, reply_len);
+
+	result = ((struct gfsc_header *)reply)->data;
+	if (result < 0 && result != -E2BIG) {
+		rv = result;
+		goto out_close;
+	}
+
+	/* a count above max would overrun reply and nodes: handle it like E2BIG */
+	if (result == -E2BIG || result > max) {
+		*count = -E2BIG;
+		node_count = max;
+	} else {
+		*count = result;
+		node_count = result;
+	}
+	rv = 0;
+	memcpy(nodes, reply + sizeof(struct gfsc_header),
+	       node_count * sizeof(struct gfsc_node));
+ out_close:
+	close(fd);
+ out_free:
+	free(reply);
+	return rv;
+}
+
+int gfsc_fs_connect(void)
+{	/* returns a connected fd for the gfsc_fs_* calls, or < 0 on failure */
+	return do_connect(GFSC_SOCK_PATH);
+}
+
+void gfsc_fs_disconnect(int fd)
+{	/* closes the fd obtained from gfsc_fs_connect() */
+	close(fd);
+}
+
+/* Send a join request for ma; returns -1 if ma->table contains no ':'. */
+int gfsc_fs_join(int fd, struct gfsc_mount_args *ma)
+{
+	char msg[sizeof(struct gfsc_header) + sizeof(struct gfsc_mount_args)];
+	struct gfsc_header *h = (struct gfsc_header *)msg;
+	char *name = strchr(ma->table, ':');
+
+	if (!name)	/* no ':' means no mountgroup name to send */
+		return -1;
+	init_header(h, GFSC_CMD_FS_JOIN, name + 1, sizeof(*ma));
+	memcpy(msg + sizeof(*h), ma, sizeof(*ma));
+	return do_write(fd, msg, sizeof(msg));
+}
+
+/* Send a remount request for ma; returns -1 if ma->table contains no ':'. */
+int gfsc_fs_remount(int fd, struct gfsc_mount_args *ma)
+{
+	char msg[sizeof(struct gfsc_header) + sizeof(struct gfsc_mount_args)];
+	struct gfsc_header *h = (struct gfsc_header *)msg;
+	char *name = strchr(ma->table, ':');
+
+	if (!name)	/* no ':' means no mountgroup name to send */
+		return -1;
+	init_header(h, GFSC_CMD_FS_REMOUNT, name + 1, sizeof(*ma));
+	memcpy(msg + sizeof(*h), ma, sizeof(*ma));
+
+	return do_write(fd, msg, sizeof(msg));
+}
+
+/*
+ * Read gfs_controld's answer to a join or remount: its header data field
+ * goes to *result and the returned mount args overwrite *ma.  Returns
+ * do_read()'s status; on failure *result and *ma are left untouched.
+ */
+int gfsc_fs_result(int fd, int *result, struct gfsc_mount_args *ma)
+{
+	char reply[sizeof(struct gfsc_header) + sizeof(struct gfsc_mount_args)];
+	const struct gfsc_header *hdr = (struct gfsc_header *)reply;
+	int status = do_read(fd, reply, sizeof(reply));
+
+	if (status < 0)
+		return status;
+	*result = hdr->data;
+	memcpy(ma, reply + sizeof(*hdr), sizeof(*ma));
+	return status;
+}
+
+/* Report the mount(2) result for ma; returns -1 if ma->table has no ':'. */
+int gfsc_fs_mount_done(int fd, struct gfsc_mount_args *ma, int result)
+{
+	char msg[sizeof(struct gfsc_header) + sizeof(struct gfsc_mount_args)];
+	struct gfsc_header *h = (struct gfsc_header *)msg;
+	char *name = strchr(ma->table, ':');
+
+	if (!name)	/* no ':' means no mountgroup name to send */
+		return -1;
+
+	init_header(h, GFSC_CMD_FS_MOUNT_DONE, name + 1, sizeof(*ma));
+	h->data = result;	/* outcome of the mount(2) call */
+	memcpy(msg + sizeof(*h), ma, sizeof(*ma));
+
+	return do_write(fd, msg, sizeof(msg));
+}
+
+/* Ask gfs_controld, over a fresh connection, to leave ma's mountgroup. */
+int gfsc_fs_leave(struct gfsc_mount_args *ma, int reason)
+{
+	char msg[sizeof(struct gfsc_header) + sizeof(struct gfsc_mount_args)];
+	struct gfsc_header *h = (struct gfsc_header *)msg;
+	char *name = strchr(ma->table, ':');
+	int fd, rv;
+
+	if (!name)	/* no ':' means no mountgroup name to send */
+		return -1;
+	init_header(h, GFSC_CMD_FS_LEAVE, name + 1, sizeof(*ma));
+	h->data = reason;
+	memcpy(msg + sizeof(*h), ma, sizeof(*ma));
+
+	fd = do_connect(GFSC_SOCK_PATH);
+	if (fd < 0)
+		return fd;
+	rv = do_write(fd, msg, sizeof(msg));
+	close(fd);	/* this connection exists only for the one request */
+	return rv;
+}
+


hooks/post-receive
--
Cluster Project