]> sourceware.org Git - lvm2.git/commitdiff
Add cluster support.
authorAlasdair Kergon <agk@redhat.com>
Thu, 24 Jun 2004 08:02:38 +0000 (08:02 +0000)
committerAlasdair Kergon <agk@redhat.com>
Thu, 24 Jun 2004 08:02:38 +0000 (08:02 +0000)
32 files changed:
Makefile.in
VERSION
WHATS_NEW
configure
configure.in
daemons/Makefile.in [new file with mode: 0644]
daemons/clvmd/Makefile.in [new file with mode: 0644]
daemons/clvmd/clvm.h [new file with mode: 0644]
daemons/clvmd/clvmd-cman.c [new file with mode: 0644]
daemons/clvmd/clvmd-command.c [new file with mode: 0644]
daemons/clvmd/clvmd-comms.h [new file with mode: 0644]
daemons/clvmd/clvmd-gulm.c [new file with mode: 0644]
daemons/clvmd/clvmd-gulm.h [new file with mode: 0644]
daemons/clvmd/clvmd.c [new file with mode: 0644]
daemons/clvmd/clvmd.h [new file with mode: 0644]
daemons/clvmd/cnxman-socket.h [new file with mode: 0644]
daemons/clvmd/libclvm.c [new file with mode: 0644]
daemons/clvmd/libclvm.h [new file with mode: 0644]
daemons/clvmd/lvm-functions.c [new file with mode: 0644]
daemons/clvmd/lvm-functions.h [new file with mode: 0644]
daemons/clvmd/system-lv.c [new file with mode: 0644]
daemons/clvmd/system-lv.h [new file with mode: 0644]
daemons/clvmd/tcp-comms.c [new file with mode: 0644]
daemons/clvmd/tcp-comms.h [new file with mode: 0644]
include/.symlinks
lib/Makefile.in
lib/locking/Makefile.in [new file with mode: 0644]
lib/locking/cluster_locking.c [new file with mode: 0644]
lib/locking/locking.c
lib/locking/locking_types.h
scripts/clvmd_fix_conf.sh [new file with mode: 0644]
scripts/clvmd_init [new file with mode: 0755]

index f7b4d5238270d3d136804ecd20eac6475f2d31de..af63fc5d9ba2d827286ab833af6358404d6d47a9 100644 (file)
@@ -22,11 +22,13 @@ ifeq ("@INTL@", "yes")
   SUBDIRS += po
 endif
 
-SUBDIRS += lib tools
+SUBDIRS += lib tools daemons
 
 ifeq ($(MAKECMDGOALS),distclean)
-  SUBDIRS += lib/format1 \
+  SUBDIRS += daemons/clvmd \
+            lib/format1 \
             lib/format_pool \
+            lib/locking \
             lib/mirror \
             lib/snapshot \
             po \
@@ -35,14 +37,16 @@ endif
 
 include make.tmpl
 
+daemons: lib
 lib: include
 tools: lib
-po: lib tools
+po: tools daemons
 
 ifeq ("@INTL@", "yes")
 lib.pofile: include.pofile
 tools.pofile: lib.pofile
-po.pofile: lib.pofile tools.pofile
+daemons.pofile: lib.pofile
+po.pofile: tools.pofile daemons.pofile
 pofile: po.pofile
 endif
 
diff --git a/VERSION b/VERSION
index f2ad88206dbf06a30b754c5c2b328ae001e1914f..a9f466bfbf9658654d445b89841e5cbff086b6b3 100644 (file)
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.00.17-cvs (2004-06-20)
+2.00.18-cvs (2004-06-24)
index b582f61bfdd22864512d48fb0b14e4d80b1fced0..94aadf13c1fc43e72c4443b59f997d1300fa94af 100644 (file)
--- a/WHATS_NEW
+++ b/WHATS_NEW
@@ -1,3 +1,7 @@
+Version 2.00.18 - 24 June 2004
+==============================
+  Add cluster support.
+
 Version 2.00.17 - 20 June 2004
 ==============================
   configure --enable-fsadm to try out fsadm.  fsadm is not tested yet.
index 783dbddad7feb8366515889c9bee3eb3d62626d5..d5efca06d921788055926b91e71fc43e2904566a 100755 (executable)
--- a/configure
+++ b/configure
@@ -309,7 +309,7 @@ ac_includes_default="\
 #endif"
 
 ac_default_prefix=/usr
-ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS AWK CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA LN_S SET_MAKE RANLIB ac_ct_RANLIB CPP EGREP build build_cpu build_vendor build_os host host_cpu host_vendor host_os target target_cpu target_vendor target_os MSGFMT JOBS STATIC_LINK LVM1 POOL SNAPSHOTS MIRRORS OWNER GROUP CLDFLAGS CLDWHOLEARCHIVE CLDNOWHOLEARCHIVE LD_DEPS LD_FLAGS SOFLAG LVM_VERSION LVM1_FALLBACK DEBUG DEVMAPPER HAVE_LIBDL HAVE_SELINUX CMDLIB LOCALEDIR CONFDIR STATICDIR INTL_PACKAGE INTL FSADM LIBOBJS LTLIBOBJS'
+ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS AWK CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA LN_S SET_MAKE RANLIB ac_ct_RANLIB CPP EGREP build build_cpu build_vendor build_os host host_cpu host_vendor host_os target target_cpu target_vendor target_os MSGFMT JOBS STATIC_LINK LVM1 POOL SNAPSHOTS MIRRORS OWNER GROUP CLDFLAGS CLDWHOLEARCHIVE CLDNOWHOLEARCHIVE LD_DEPS LD_FLAGS SOFLAG LVM_VERSION LVM1_FALLBACK DEBUG DEVMAPPER HAVE_LIBDL HAVE_SELINUX CMDLIB LOCALEDIR CONFDIR STATICDIR INTL_PACKAGE INTL CLVMD CLUSTER FSADM LIBOBJS LTLIBOBJS'
 ac_subst_files=''
 
 # Initialize some variables set by options.
@@ -867,10 +867,13 @@ Optional Packages:
                           TYPE=internal
   --with-pool=TYPE        GFS pool read-only support: internal/shared/none
                           TYPE=internal
+  --with-cluster=TYPE     Cluster LVM locking support: internal/shared/none
+                          TYPE=internal
   --with-snapshots=TYPE   Snapshot support: internal/shared/none
                           TYPE=internal
   --with-mirrors=TYPE     Mirror support: internal/shared/none
                           TYPE=internal
+  --with-clvmd            Build cluster LVM Daemon
   --with-localedir=DIR    Translation files in DIR PREFIX/share/locale
   --with-confdir=DIR      Configuration files in DIR /etc
   --with-staticdir=DIR    Static binary in DIR EXEC_PREFIX/sbin
@@ -3900,6 +3903,7 @@ case "$host_os" in
                SOFLAG="-shared"
                DEVMAPPER=yes
                ODIRECT=yes
+               CLUSTER=internal
                FSADM=no ;;
        darwin*)
                CFLAGS="-no-cpp-precomp -fno-common"
@@ -3911,6 +3915,7 @@ case "$host_os" in
                SOFLAG="-dynamiclib"
                DEVMAPPER=no
                ODIRECT=no
+               CLUSTER=none
                FSADM=no ;;
 esac
 
@@ -3998,6 +4003,25 @@ if test x$POOL = xinternal; then
 fi
 
 
+# Check whether --with-cluster or --without-cluster was given.
+if test "${with_cluster+set}" = set; then
+  withval="$with_cluster"
+   CLUSTER="$withval"
+fi;
+
+if [ "x$CLUSTER" != xnone -a "x$CLUSTER" != xinternal -a "x$CLUSTER" != xshared ];
+ then  { { echo "$as_me:$LINENO: error: --with-cluster parameter invalid
+" >&5
+echo "$as_me: error: --with-cluster parameter invalid
+" >&2;}
+   { (exit 1); exit 1; }; }
+ exit
+fi;
+
+if test x$CLUSTER = xinternal; then
+       CFLAGS="$CFLAGS -DCLUSTER_LOCKING_INTERNAL"
+fi
+
 # Check whether --enable-jobs or --disable-jobs was given.
 if test "${enable_jobs+set}" = set; then
   enableval="$enable_jobs"
@@ -4071,6 +4095,20 @@ if test x$READLINE = xyes; then
        CFLAGS="$CFLAGS -DREADLINE_SUPPORT"
 fi
 
+
+# Check whether --with-clvmd or --without-clvmd was given.
+if test "${with_clvmd+set}" = set; then
+  withval="$with_clvmd"
+  \
+CLVMD=$withval
+else
+  CLVMD=no
+fi;
+if  test x$CLVMD = xyes && test x$CLUSTER = xnone; then
+       CLUSTER=internal
+fi
+echo "$ac_t""$CLVMD" 1>&6
+
 echo $ac_n "checking whether to enable debugging""... $ac_c" 1>&6
 # Check whether --enable-debug or --disable-debug was given.
 if test "${enable_debug+set}" = set; then
@@ -4698,7 +4736,7 @@ else
        HAVE_LIBDL=no
 fi
 
-if [ \( "x$LVM1" = xshared -o "x$POOL" = xshared -o \
+if [ \( "x$LVM1" = xshared -o "x$POOL" = xshared -o "x$CLUSTER" = xshared -o \
       "x$SNAPSHOTS" = xshared -o "x$MIRRORS" = xshared \
       \) -a "x$STATIC_LINK" = xyes ];
  then  { { echo "$as_me:$LINENO: error: Features cannot be 'shared' when building statically
@@ -5207,7 +5245,9 @@ fi
 
 
 
-                                                                                                                                                                                              ac_config_files="$ac_config_files Makefile make.tmpl doc/Makefile include/Makefile lib/Makefile lib/format1/Makefile lib/format_pool/Makefile lib/mirror/Makefile lib/snapshot/Makefile man/Makefile po/Makefile tools/Makefile tools/version.h tools/fsadm/Makefile test/mm/Makefile test/device/Makefile test/format1/Makefile test/regex/Makefile test/filters/Makefile"
+
+
+                                                                                                                                                                                                                            ac_config_files="$ac_config_files Makefile make.tmpl daemons/Makefile daemons/clvmd/Makefile doc/Makefile include/Makefile lib/Makefile lib/format1/Makefile lib/format_pool/Makefile lib/locking/Makefile lib/mirror/Makefile lib/snapshot/Makefile man/Makefile po/Makefile tools/Makefile tools/version.h tools/fsadm/Makefile test/mm/Makefile test/device/Makefile test/format1/Makefile test/regex/Makefile test/filters/Makefile"
 cat >confcache <<\_ACEOF
 # This file is a shell script that caches the results of configure
 # tests run on this system so they can be shared between configure
@@ -5760,11 +5800,14 @@ do
   # Handling of arguments.
   "Makefile" ) CONFIG_FILES="$CONFIG_FILES Makefile" ;;
   "make.tmpl" ) CONFIG_FILES="$CONFIG_FILES make.tmpl" ;;
+  "daemons/Makefile" ) CONFIG_FILES="$CONFIG_FILES daemons/Makefile" ;;
+  "daemons/clvmd/Makefile" ) CONFIG_FILES="$CONFIG_FILES daemons/clvmd/Makefile" ;;
   "doc/Makefile" ) CONFIG_FILES="$CONFIG_FILES doc/Makefile" ;;
   "include/Makefile" ) CONFIG_FILES="$CONFIG_FILES include/Makefile" ;;
   "lib/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/Makefile" ;;
   "lib/format1/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/format1/Makefile" ;;
   "lib/format_pool/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/format_pool/Makefile" ;;
+  "lib/locking/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/locking/Makefile" ;;
   "lib/mirror/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/mirror/Makefile" ;;
   "lib/snapshot/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/snapshot/Makefile" ;;
   "man/Makefile" ) CONFIG_FILES="$CONFIG_FILES man/Makefile" ;;
@@ -5916,6 +5959,8 @@ s,@CONFDIR@,$CONFDIR,;t t
 s,@STATICDIR@,$STATICDIR,;t t
 s,@INTL_PACKAGE@,$INTL_PACKAGE,;t t
 s,@INTL@,$INTL,;t t
+s,@CLVMD@,$CLVMD,;t t
+s,@CLUSTER@,$CLUSTER,;t t
 s,@FSADM@,$FSADM,;t t
 s,@LIBOBJS@,$LIBOBJS,;t t
 s,@LTLIBOBJS@,$LTLIBOBJS,;t t
index c07b020a8d77df21c8ebf062102e40e318d36152..95d3a9726a4e8684989725c5e43ba4a6888b8083 100644 (file)
@@ -59,6 +59,7 @@ case "$host_os" in
                SOFLAG="-shared"
                DEVMAPPER=yes
                ODIRECT=yes
+               CLUSTER=internal
                FSADM=no ;;
        darwin*)
                CFLAGS="-no-cpp-precomp -fno-common"
@@ -70,6 +71,7 @@ case "$host_os" in
                SOFLAG="-dynamiclib"
                DEVMAPPER=no
                ODIRECT=no
+               CLUSTER=none
                FSADM=no ;;
 esac
 
@@ -141,6 +143,22 @@ if test x$POOL = xinternal; then
        CFLAGS="$CFLAGS -DPOOL_INTERNAL"
 fi
 
+dnl -- cluster_locking inclusion type
+AC_ARG_WITH(cluster,
+  [  --with-cluster=TYPE     Cluster LVM locking support: internal/shared/none
+                          [TYPE=internal] ],
+  [ CLUSTER="$withval" ])
+
+if [[ "x$CLUSTER" != xnone -a "x$CLUSTER" != xinternal -a "x$CLUSTER" != xshared ]];
+ then  AC_MSG_ERROR(
+--with-cluster parameter invalid
+)
+ exit
+fi;
+
+if test x$CLUSTER = xinternal; then
+       CFLAGS="$CFLAGS -DCLUSTER_LOCKING_INTERNAL"
+fi
 
 AC_ARG_ENABLE(jobs, [  --enable-jobs=NUM       Number of jobs to run simultaneously], JOBS=-j$enableval, JOBS=-j2)
 
@@ -192,6 +210,15 @@ if test x$READLINE = xyes; then
        CFLAGS="$CFLAGS -DREADLINE_SUPPORT"
 fi
 
+dnl Build cluster LVM daemon
+AC_ARG_WITH(clvmd, [  --with-clvmd            Build cluster LVM Daemon],  \
+CLVMD=$withval, CLVMD=no)
+dnl If clvmd enabled and not cluster locking, automatically include the locking.
+if  test x$CLVMD = xyes && test x$CLUSTER = xnone; then
+       CLUSTER=internal
+fi
+echo "$ac_t""$CLVMD" 1>&6
+
 echo $ac_n "checking whether to enable debugging""... $ac_c" 1>&6
 dnl Enable Debugging
 AC_ARG_ENABLE(debug,    [  --enable-debug          Enable debugging],  \
@@ -272,7 +299,7 @@ else
 fi
 
 dnl Check for shared/static conflicts
-if [[ \( "x$LVM1" = xshared -o "x$POOL" = xshared -o \
+if [[ \( "x$LVM1" = xshared -o "x$POOL" = xshared -o "x$CLUSTER" = xshared -o \
       "x$SNAPSHOTS" = xshared -o "x$MIRRORS" = xshared \
       \) -a "x$STATIC_LINK" = xyes ]];
  then  AC_MSG_ERROR(
@@ -377,6 +404,8 @@ AC_SUBST(CONFDIR)
 AC_SUBST(STATICDIR)
 AC_SUBST(INTL_PACKAGE)
 AC_SUBST(INTL)
+AC_SUBST(CLVMD)
+AC_SUBST(CLUSTER)
 AC_SUBST(FSADM)
 
 dnl First and last lines should not contain files to generate in order to 
@@ -384,11 +413,14 @@ dnl keep utility scripts running properly
 AC_OUTPUT(                                                             \
 Makefile                                                               \
 make.tmpl                                                               \
+daemons/Makefile                                                       \
+daemons/clvmd/Makefile                                                 \
 doc/Makefile                                                           \
 include/Makefile                                                       \
 lib/Makefile                                                           \
 lib/format1/Makefile                                                   \
 lib/format_pool/Makefile                                               \
+lib/locking/Makefile                                                   \
 lib/mirror/Makefile                                                    \
 lib/snapshot/Makefile                                                  \
 man/Makefile                                                           \
diff --git a/daemons/Makefile.in b/daemons/Makefile.in
new file mode 100644 (file)
index 0000000..a951632
--- /dev/null
@@ -0,0 +1,23 @@
+#
+# Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+#
+# This file is part of the LVM2.
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions
+# of the GNU General Public License v.2.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+
+ifeq ("@CLVMD@", "yes")
+  SUBDIRS = clvmd
+endif
+
+include $(top_srcdir)/make.tmpl
+
diff --git a/daemons/clvmd/Makefile.in b/daemons/clvmd/Makefile.in
new file mode 100644 (file)
index 0000000..54563e7
--- /dev/null
@@ -0,0 +1,47 @@
+#
+# Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+#
+# This file is part of the LVM2.
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions
+# of the GNU General Public License v.2.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+
+SOURCES = \
+       clvmd-cman.c     \
+       clvmd-command.c  \
+       clvmd.c          \
+       libclvm.c        \
+       lvm-functions.c  \
+       system-lv.c
+
+TARGETS = \
+       clvmd
+
+include $(top_srcdir)/make.tmpl
+
+CFLAGS += -D_REENTRANT -fno-strict-aliasing
+LIBS += -ldevmapper -ldlm -llvm -lpthread
+
+INSTALL_TARGETS = \
+       install_clvmd
+
+clvmd: $(OBJECTS) $(top_srcdir)/lib/liblvm.a
+       $(CC) -o clvmd $(OBJECTS) $(LD_FLAGS) $(LVMLIBS) $(LIBS)
+
+.PHONY: install_clvmd
+
+install_clvmd: $(TARGETS)
+       $(INSTALL) -D $(OWNER) $(GROUP) -m 555 $(STRIP) clvmd \
+               $(sbindir)/clvmd
+
+install: $(INSTALL_TARGETS)
+
diff --git a/daemons/clvmd/clvm.h b/daemons/clvmd/clvm.h
new file mode 100644 (file)
index 0000000..dd20bfd
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Definitions for CLVMD server and clients */
+
+/*
+ * The protocol spoken over the cluster and across the local socket.
+ */
+
+#ifndef _CLVM_H
+#define _CLVM_H
+
+struct clvm_header {
+       uint8_t  cmd;           /* See below */
+       uint8_t  flags;         /* See below */
+       uint16_t xid;           /* Transaction ID */
+       uint32_t clientid;      /* Only used in Daemon->Daemon comms */
+       int32_t  status;        /* For replies, whether request succeeded */
+       uint32_t arglen;        /* Length of argument below. 
+                                  If >1500 then it will be passed 
+                                  around the cluster in the system LV */
+       char node[1];           /* Actually a NUL-terminated string, node name.
+                                  If this is empty then the command is 
+                                  forwarded to all cluster nodes unless 
+                                  FLAG_LOCAL is also set. */
+       char args[1];           /* Arguments for the command follow the 
+                                  node name, This member is only
+                                  valid if the node name is empty */
+} __attribute__ ((packed));
+
+/* Flags */
+#define CLVMD_FLAG_LOCAL        1      /* Only do this on the local node */
+#define CLVMD_FLAG_SYSTEMLV     2      /* Data in system LV under my node name */
+
+/* Name of the local socket to communicate between libclvm and clvmd */
+//static const char CLVMD_SOCKNAME[]="/var/run/clvmd";
+static const char CLVMD_SOCKNAME[] = "\0clvmd";
+
+/* Internal commands & replies */
+#define CLVMD_CMD_REPLY    1
+#define CLVMD_CMD_VERSION  2   /* Send version around cluster when we start */
+#define CLVMD_CMD_GOAWAY   3   /* Die if received this - we are running 
+                                  an incompatible version */
+#define CLVMD_CMD_TEST     4   /* Just for mucking about */
+
+#define CLVMD_CMD_LOCK              30
+#define CLVMD_CMD_UNLOCK            31
+
+/* Lock/Unlock commands */
+#define CLVMD_CMD_LOCK_LV           50
+#define CLVMD_CMD_LOCK_VG           51
+
+#endif
diff --git a/daemons/clvmd/clvmd-cman.c b/daemons/clvmd/clvmd-cman.c
new file mode 100644 (file)
index 0000000..751f4dd
--- /dev/null
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * CMAN communication layer for clvmd.
+ */
+
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/un.h>
+#include <sys/time.h>
+#include <sys/ioctl.h>
+#include <sys/utsname.h>
+#include <syslog.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <signal.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <errno.h>
+
+#include "clvmd-comms.h"
+#include "clvm.h"
+#include "libdlm.h"
+#include "log.h"
+#include "clvmd.h"
+#include "lvm-functions.h"
+
+#define LOCKSPACE_NAME "clvmd"
+
+static int cluster_sock;
+static int num_nodes;
+static struct cl_cluster_node *nodes = NULL;
+static int count_nodes; /* size of allocated nodes array */
+static int max_updown_nodes = 50;      /* Current size of the allocated array */
+/* Node up/down status, indexed by nodeid */
+static int *node_updown = NULL;
+static dlm_lshandle_t *lockspace;
+
+static void sigusr1_handler(int sig);
+static void count_clvmds_running(void);
+static void get_members(void);
+static int nodeid_from_csid(char *csid);
+static int name_from_nodeid(int nodeid, char *name);
+
+struct lock_wait {
+       pthread_cond_t cond;
+       pthread_mutex_t mutex;
+       struct dlm_lksb lksb;
+};
+
+int init_cluster()
+{
+       struct sockaddr_cl saddr;
+       int port = CLUSTER_PORT_CLVMD;
+
+       /* Open the cluster communication socket */
+       cluster_sock = socket(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT);
+       if (cluster_sock == -1) {
+               perror("Can't open cluster socket");
+               return -1;
+       }
+
+       /* Bind to our port number on the cluster.
+          Writes to this will block if the cluster loses quorum */
+       saddr.scl_family = AF_CLUSTER;
+       saddr.scl_port = port;
+
+       if (bind
+           (cluster_sock, (struct sockaddr *) &saddr,
+            sizeof(struct sockaddr_cl))) {
+               log_error("Can't bind cluster socket: %m");
+               return -1;
+       }
+
+       /* Get the cluster members list */
+       get_members();
+       count_clvmds_running();
+
+       /* Create a lockspace for LV & VG locks to live in */
+       lockspace = dlm_create_lockspace(LOCKSPACE_NAME, 0600);
+       if (!lockspace) {
+               log_error("Unable to create lockspace for CLVM\n");
+               return -1;
+       }
+       dlm_ls_pthread_init(lockspace);
+       return 0;
+}
+
+int get_main_cluster_fd()
+{
+       return cluster_sock;
+}
+
+int get_num_nodes()
+{
+       return num_nodes;
+}
+
+/* send_message with the fd check removed */
+int cluster_send_message(void *buf, int msglen, char *csid, const char *errtext)
+{
+       struct iovec iov[2];
+       struct msghdr msg;
+       struct sockaddr_cl saddr;
+       int len = 0;
+
+       msg.msg_control = NULL;
+       msg.msg_controllen = 0;
+       msg.msg_iovlen = 1;
+       msg.msg_iov = iov;
+       msg.msg_flags = 0;
+       iov[0].iov_len = msglen;
+       iov[0].iov_base = buf;
+
+       saddr.scl_family = AF_CLUSTER;
+       saddr.scl_port = CLUSTER_PORT_CLVMD;
+       if (csid) {
+               msg.msg_name = &saddr;
+               msg.msg_namelen = sizeof(saddr);
+               memcpy(&saddr.scl_nodeid, csid, MAX_CSID_LEN);
+       } else {                /* Cluster broadcast */
+
+               msg.msg_name = NULL;
+               msg.msg_namelen = 0;
+       }
+
+       do {
+               len = sendmsg(cluster_sock, &msg, 0);
+               if (len < 0 && errno != EAGAIN)
+                       log_error(errtext);
+
+       } while (len == -1 && errno == EAGAIN);
+       return len;
+}
+
+void get_our_csid(char *csid)
+{
+       int i;
+       memset(csid, 0, MAX_CSID_LEN);
+
+       for (i = 0; i < num_nodes; i++) {
+               if (nodes[i].us)
+                       memcpy(csid, &nodes[i].node_id, MAX_CSID_LEN);
+       }
+}
+
+/* Call a callback routine for each known node (down means not running a clvmd) */
+int cluster_do_node_callback(struct local_client *client,
+                            void (*callback) (struct local_client *, char *,
+                                              int))
+{
+       int i;
+       int somedown = 0;
+
+       for (i = 0; i < get_num_nodes(); i++) {
+               callback(client, (char *)&nodes[i].node_id, node_updown[nodes[i].node_id]);
+               if (!node_updown[nodes[i].node_id])
+                       somedown = -1;
+       }
+       return somedown;
+}
+
+/* Process OOB message from the cluster socket,
+   this currently just means that a node has stopped listening on our port */
+static void process_oob_msg(char *buf, int len, int nodeid)
+{
+       char namebuf[256];
+       switch (buf[0]) {
+        case CLUSTER_OOB_MSG_PORTCLOSED:
+               name_from_nodeid(nodeid, namebuf);
+               log_notice("clvmd on node %s has died\n", namebuf);
+               DEBUGLOG("Got OOB message, removing node %s\n", namebuf);
+
+               node_updown[nodeid] = 0;
+               break;
+
+       case CLUSTER_OOB_MSG_STATECHANGE:
+               DEBUGLOG("Got OOB message, Cluster state change\n");
+               get_members();
+               break;
+       default:
+               /* ERROR */
+               DEBUGLOG("Got unknown OOB message: %d\n", buf[0]);
+       }
+}
+
+int cluster_fd_callback(struct local_client *fd, char *buf, int len, char *csid,
+                       struct local_client **new_client)
+{
+       struct iovec iov[2];
+       struct msghdr msg;
+       struct sockaddr_cl saddr;
+
+       /* We never return a new client */
+       *new_client = NULL;
+
+       msg.msg_control = NULL;
+       msg.msg_controllen = 0;
+       msg.msg_iovlen = 1;
+       msg.msg_iov = iov;
+       msg.msg_name = &saddr;
+       msg.msg_flags = 0;
+       msg.msg_namelen = sizeof(saddr);
+       iov[0].iov_len = len;
+       iov[0].iov_base = buf;
+
+       len = recvmsg(cluster_sock, &msg, MSG_OOB | O_NONBLOCK);
+       if (len < 0 && errno == EAGAIN)
+               return len;
+
+       DEBUGLOG("Read on cluster socket, len = %d\n", len);
+
+       /* A real error */
+       if (len < 0) {
+               log_error("read error on cluster socket: %m");
+               return 0;
+       }
+
+       /* EOF - we have left the cluster */
+       if (len == 0)
+               return 0;
+
+       /* Is it OOB? probably a node gone down */
+       if (msg.msg_flags & MSG_OOB) {
+               process_oob_msg(iov[0].iov_base, len, saddr.scl_nodeid);
+
+               /* Tell the upper layer to ignore this message */
+               len = -1;
+               errno = EAGAIN;
+       }
+       memcpy(csid, &saddr.scl_nodeid, sizeof(saddr.scl_nodeid));
+       return len;
+}
+
+void add_up_node(char *csid)
+{
+       /* It's up ! */
+       int nodeid = nodeid_from_csid(csid);
+
+       if (nodeid >= max_updown_nodes) {
+               int *new_updown = realloc(node_updown, max_updown_nodes + 10);
+
+               if (new_updown) {
+                       node_updown = new_updown;
+                       max_updown_nodes += 10;
+                       DEBUGLOG("realloced more space for nodes. now %d\n",
+                                max_updown_nodes);
+               } else {
+                       log_error
+                           ("Realloc failed. Node status for clvmd will be wrong\n");
+                       return;
+               }
+       }
+       node_updown[nodeid] = 1;
+       DEBUGLOG("Added new node %d to updown list\n", nodeid);
+}
+
+void cluster_closedown()
+{
+       unlock_all();
+       dlm_release_lockspace(LOCKSPACE_NAME, lockspace, 1);
+       close(cluster_sock);
+}
+
+static int is_listening(int nodeid)
+{
+       struct cl_listen_request rq;
+       int status;
+
+       rq.port = CLUSTER_PORT_CLVMD;
+       rq.nodeid = nodeid;
+
+       do {
+               status = ioctl(cluster_sock, SIOCCLUSTER_ISLISTENING, &rq);
+               if (status < 0 && errno == EBUSY) {     /* Don't busywait */
+                       sleep(1);
+                       errno = EBUSY;  /* In case sleep trashes it */
+               }
+       }
+       while (status < 0 && errno == EBUSY);
+
+       return status;
+}
+
+/* Populate the list of CLVMDs running.
+   called only at startup time */
+void count_clvmds_running(void)
+{
+       int i;
+
+       for (i = 0; i < num_nodes; i++) {
+               node_updown[nodes[i].node_id] = is_listening(nodes[i].node_id);
+       }
+}
+
+/* Get a list of active cluster members */
+static void get_members()
+{
+       struct cl_cluster_nodelist nodelist;
+
+       num_nodes = ioctl(cluster_sock, SIOCCLUSTER_GETMEMBERS, 0);
+       if (num_nodes == -1) {
+               perror("get nodes");
+       } else {
+               /* Not enough room for new nodes list ? */
+               if (num_nodes > count_nodes && nodes) {
+                       free(nodes);
+                       nodes = NULL;
+               }
+
+               if (nodes == NULL) {
+                       count_nodes = num_nodes + 10; /* Overallocate a little */
+                       nodes = malloc(count_nodes * sizeof(struct cl_cluster_node));
+                       if (!nodes) {
+                               perror("Unable to allocate nodes array\n");
+                               exit(5);
+                       }
+               }
+               nodelist.max_members = count_nodes;
+               nodelist.nodes = nodes;
+               
+               num_nodes = ioctl(cluster_sock, SIOCCLUSTER_GETMEMBERS, &nodelist);
+               if (num_nodes <= 0) {
+                       perror("get node details");
+                       exit(6);
+               }
+
+               /* Sanity check struct */
+               if (nodes[0].size != sizeof(struct cl_cluster_node)) {
+                       log_error
+                           ("sizeof(cl_cluster_node) does not match size returned from the kernel: aborting\n");
+                       exit(10);
+               }
+
+               if (node_updown == NULL) {
+                       node_updown =
+                           (int *) malloc(sizeof(int) *
+                                          max(num_nodes, max_updown_nodes));
+                       memset(node_updown, 0,
+                              sizeof(int) * max(num_nodes, max_updown_nodes));
+               }
+       }
+}
+
+/* Convert a node name to a CSID */
+int csid_from_name(char *csid, char *name)
+{
+       int i;
+
+       for (i = 0; i < num_nodes; i++) {
+               if (strcmp(name, nodes[i].name) == 0) {
+                       memcpy(csid, &nodes[i].node_id, MAX_CSID_LEN);
+                       return 0;
+               }
+       }
+       return -1;
+}
+
+/* Convert a CSID to a node name */
+int name_from_csid(char *csid, char *name)
+{
+       int i;
+
+       for (i = 0; i < num_nodes; i++) {
+               if (memcmp(csid, &nodes[i].node_id, MAX_CSID_LEN) == 0) {
+                       strcpy(name, nodes[i].name);
+                       return 0;
+               }
+       }
+       /* Who?? */
+       strcpy(name, "Unknown");
+       return -1;
+}
+
+/* Convert a node ID to a node name */
+int name_from_nodeid(int nodeid, char *name)
+{
+       int i;
+
+       for (i = 0; i < num_nodes; i++) {
+               if (nodeid == nodes[i].node_id) {
+                       strcpy(name, nodes[i].name);
+                       return 0;
+               }
+       }
+       /* Who?? */
+       strcpy(name, "Unknown");
+       return -1;
+}
+
+/* Convert a CSID to a node ID */
+static int nodeid_from_csid(char *csid)
+{
+        int nodeid;
+
+       memcpy(&nodeid, csid, MAX_CSID_LEN);
+
+       return nodeid;
+}
+
+int is_quorate()
+{
+       return ioctl(cluster_sock, SIOCCLUSTER_ISQUORATE, 0);
+}
+
+static void sync_ast_routine(void *arg)
+{
+       struct lock_wait *lwait = arg;
+
+       pthread_mutex_lock(&lwait->mutex);
+       pthread_cond_signal(&lwait->cond);
+       pthread_mutex_unlock(&lwait->mutex);
+}
+
+int sync_lock(const char *resource, int mode, int flags, int *lockid)
+{
+       int status;
+       struct lock_wait lwait;
+
+       if (!lockid) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       /* Conversions need the lockid in the LKSB */
+       if (flags & LKF_CONVERT)
+               lwait.lksb.sb_lkid = *lockid;
+
+       pthread_cond_init(&lwait.cond, NULL);
+       pthread_mutex_init(&lwait.mutex, NULL);
+       pthread_mutex_lock(&lwait.mutex);
+
+       status = dlm_ls_lock(lockspace,
+                            mode,
+                            &lwait.lksb,
+                            flags,
+                            resource,
+                            strlen(resource),
+                            0, sync_ast_routine, &lwait, NULL, NULL);
+       if (status)
+               return status;
+
+       /* Wait for it to complete */
+       pthread_cond_wait(&lwait.cond, &lwait.mutex);
+       pthread_mutex_unlock(&lwait.mutex);
+
+       *lockid = lwait.lksb.sb_lkid;
+
+       errno = lwait.lksb.sb_status;
+       if (lwait.lksb.sb_status)
+               return -1;
+       else
+               return 0;
+}
+
+int sync_unlock(const char *resource /* UNUSED */, int lockid)
+{
+       int status;
+       struct lock_wait lwait;
+
+       pthread_cond_init(&lwait.cond, NULL);
+       pthread_mutex_init(&lwait.mutex, NULL);
+       pthread_mutex_lock(&lwait.mutex);
+
+       status = dlm_ls_unlock(lockspace, lockid, 0, &lwait.lksb, &lwait);
+
+       if (status)
+               return status;
+
+       /* Wait for it to complete */
+       pthread_cond_wait(&lwait.cond, &lwait.mutex);
+       pthread_mutex_unlock(&lwait.mutex);
+
+       errno = lwait.lksb.sb_status;
+       if (lwait.lksb.sb_status != EUNLOCK)
+               return -1;
+       else
+               return 0;
+
+}
diff --git a/daemons/clvmd/clvmd-command.c b/daemons/clvmd/clvmd-command.c
new file mode 100644 (file)
index 0000000..517c134
--- /dev/null
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+
+  CLVMD Cluster LVM daemon command processor.
+
+  To add commands to the daemon simply add a processor in do_command and return
+  any messages back in buf and the length in *retlen. The initial value of
+  buflen is the maximum size of the buffer. If buf is not large enough then it
+  may be reallocated by the functions in here to a suitable size bearing in
+  mind that anything larger than the passed-in size will have to be returned
+  using the system LV and so performance will suffer.
+
+  The status return will be negated and passed back to the originating node.
+
+  pre- and post- command routines are called only on the local node. The
+  purpose is primarily to get and release locks, though the pre- routine should
+  also do any other local setups required by the command (if any) and can
+  return a failure code that prevents the command from being distributed around
+  the cluster
+
+  The pre- and post- routines are run in their own thread so can block for as
+  long as they like; do_command is run in the main clvmd thread so should not
+  block for too long. If the pre-command returns an error code (!=0) then the
+  command will not be propagated around the cluster but the post-command WILL
+  be called
+
+  Also note that the pre and post routine are *always* called on the local
+  node, even if the command to be executed was only requested to run on a
+  remote node. It may peek inside the client structure to check the status of
+  the command.
+
+  The clients of the daemon must, naturally, understand the return messages and
+  codes.
+
+  Routines in here may only READ the values in the client structure passed in
+  apart from client->private which they are free to do what they like with.
+
+*/
+
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "list.h"
+#include "locking.h"
+#include "log.h"
+#include "lvm-functions.h"
+#include "clvmd-comms.h"
+#include "clvm.h"
+#include "clvmd.h"
+#include "libdlm.h"
+
+/* This is where all the real work happens:
+   NOTE: client will be NULL when this is executed on a remote node.
+
+   Dispatches one clvmd protocol command.  The reply text goes into *buf
+   (capacity buflen, which may be enlarged here) and its length including
+   the trailing NUL into *retlen.  Returns 0 on success or an errno-style
+   code; failures also place the strerror() text in the reply buffer. */
+int do_command(struct local_client *client, struct clvm_header *msg, int msglen,
+              char **buf, int buflen, int *retlen)
+{
+       char *args = msg->node + strlen(msg->node) + 1;
+       int arglen = msglen - sizeof(struct clvm_header) - strlen(msg->node);
+       int status = 0;
+       char *lockname;
+       struct utsname nodeinfo;
+       unsigned char lock_cmd;
+       unsigned char lock_flags;
+
+       /* Do the command */
+       switch (msg->cmd) {
+               /* Just a test message */
+       case CLVMD_CMD_TEST:
+               if (arglen > buflen) {
+                       /* FIX: check realloc() - previously the result was
+                          assigned straight to *buf, so an allocation failure
+                          lost the buffer and crashed in snprintf(NULL, ...).
+                          On failure we keep the old (smaller) buffer and let
+                          snprintf truncate. */
+                       char *new_buf = realloc(*buf, arglen + 200);
+                       if (new_buf) {
+                               *buf = new_buf;
+                               buflen = arglen + 200;
+                       }
+               }
+               uname(&nodeinfo);
+               *retlen = 1 + snprintf(*buf, buflen, "TEST from %s: %s v%s",
+                                      nodeinfo.nodename, args,
+                                      nodeinfo.release);
+               break;
+
+       case CLVMD_CMD_LOCK_VG:
+               /* Check to see if the VG is in use by LVM1 */
+               status = do_check_lvm1(&args[2]);
+               break;
+
+       case CLVMD_CMD_LOCK_LV:
+               /* This is the biggie */
+               lock_cmd = args[0];
+               lock_flags = args[1];
+               lockname = &args[2];
+               status = do_lock_lv(lock_cmd, lock_flags, lockname);
+               /* Replace EIO with something less scary */
+               if (status == EIO) {
+                       *retlen =
+                           1 + snprintf(*buf, buflen,
+                                        "Internal lvm error, check syslog");
+                       return EIO;
+               }
+               break;
+
+       default:
+               /* Won't get here because command is validated in pre_command */
+               break;
+       }
+
+       /* Check the status of the command and return the error text */
+       if (status) {
+               /* FIX: use "%s" - passing strerror() output as the format
+                  string misbehaves (or worse) if the message ever contains
+                  a '%' conversion */
+               *retlen = 1 + snprintf(*buf, buflen, "%s", strerror(status));
+       }
+
+       return status;
+
+}
+
+/* Pre-command is a good place to get locks that are needed only for the duration
+   of the commands around the cluster (don't forget to free them in post-command),
+   and to sanity check the command arguments.
+
+   Runs on the local node only, in its own thread (may block).  Returns 0
+   to allow the command to be distributed, non-zero to stop it (the
+   post-command still runs either way - see do_post_command). */
+int do_pre_command(struct local_client *client)
+{
+       struct clvm_header *header =
+           (struct clvm_header *) client->bits.localsock.cmd;
+       unsigned char lock_cmd;
+       unsigned char lock_flags;
+       /* Packed argument bytes follow the NUL-terminated node name */
+       char *args = header->node + strlen(header->node) + 1;
+       int lockid;
+       int status = 0;
+       char *lockname;
+
+       switch (header->cmd) {
+       case CLVMD_CMD_TEST:
+               /* Hold a cluster-wide EX lock for the duration of the test */
+               status = sync_lock("CLVMD_TEST", LKM_EXMODE, 0, &lockid);
+               /* Lockid is smuggled to do_post_command via the private
+                  pointer (int stored in a void *) */
+               client->bits.localsock.private = (void *) lockid;
+               break;
+
+       case CLVMD_CMD_LOCK_VG:
+               /* args[0]=lock cmd, args[1]=flags, args[2..]=VG name */
+               lock_cmd = args[0];
+               lock_flags = args[1];
+               lockname = &args[2];
+               DEBUGLOG("doing PRE command LOCK_VG %s at %x\n", lockname,
+                        lock_cmd);
+               if (lock_cmd == LCK_UNLOCK) {
+                       /* NOTE(review): hold_unlock() return value is
+                          ignored - confirm that is intentional */
+                       hold_unlock(lockname);
+               } else {
+                       status =
+                           hold_lock(lockname, (int) lock_cmd,
+                                     (int) lock_flags);
+                       /* hold_lock reports failure via errno */
+                       if (status)
+                               status = errno;
+               }
+               break;
+
+       case CLVMD_CMD_LOCK_LV:
+               lock_cmd = args[0];
+               lock_flags = args[1];
+               lockname = &args[2];
+               status = pre_lock_lv(lock_cmd, lock_flags, lockname);
+               break;
+
+       default:
+               /* Unknown commands are rejected here, before distribution */
+               log_error("Unknown command %d received\n", header->cmd);
+               status = EINVAL;
+       }
+       return status;
+}
+
+/* Note that the post-command routine is called even if the pre-command or the real command
+   failed.
+
+   Releases whatever do_pre_command acquired.  Runs on the local node, in
+   the same worker thread context as the pre-command. */
+int do_post_command(struct local_client *client)
+{
+       struct clvm_header *header =
+           (struct clvm_header *) client->bits.localsock.cmd;
+       int status = 0;
+       unsigned char lock_cmd;
+       unsigned char lock_flags;
+       char *args = header->node + strlen(header->node) + 1;
+       char *lockname;
+
+       switch (header->cmd) {
+       case CLVMD_CMD_TEST:
+               /* Release the EX lock; the lockid was stashed in the
+                  private pointer by do_pre_command */
+               status =
+                   sync_unlock("CLVMD_TEST", (int) (long) client->bits.localsock.private);
+               break;
+
+       case CLVMD_CMD_LOCK_VG:
+               /* Nothing to do here */
+               break;
+
+       case CLVMD_CMD_LOCK_LV:
+               lock_cmd = args[0];
+               lock_flags = args[1];
+               lockname = &args[2];
+               status = post_lock_lv(lock_cmd, lock_flags, lockname);
+               break;
+       }
+       return status;
+}
diff --git a/daemons/clvmd/clvmd-comms.h b/daemons/clvmd/clvmd-comms.h
new file mode 100644 (file)
index 0000000..54017b3
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * Abstraction layer for clvmd cluster communications
+ */
+
+#ifndef _CLVMD_COMMS_H
+#define _CLVMD_COMMS_H
+
+struct local_client;
+
+extern int cluster_send_message(void *buf, int msglen, char *csid,
+                               const char *errtext);
+extern int name_from_csid(char *csid, char *name);
+extern int csid_from_name(char *csid, char *name);
+extern int get_num_nodes(void);
+extern int cluster_fd_callback(struct local_client *fd, char *buf, int len,
+                              char *csid, struct local_client **new_client);
+extern int init_cluster(void);
+extern int get_main_cluster_fd(void);  /* gets accept FD or cman cluster socket */
+extern int cluster_do_node_callback(struct local_client *client,
+                                   void (*callback) (struct local_client *,
+                                                     char *csid, int node_up));
+extern int is_quorate(void);
+
+extern void get_our_csid(char *csid);
+extern void add_up_node(char *csid);
+extern void cluster_closedown(void);
+
+extern int sync_lock(const char *resource, int mode, int flags, int *lockid);
+extern int sync_unlock(const char *resource, int lockid);
+
+#ifdef USE_GULM
+#include "tcp-comms.h"
+#else
+/* cman */
+#include "cnxman-socket.h"
+#define MAX_CSID_LEN 4
+#endif
+
+
+#endif
diff --git a/daemons/clvmd/clvmd-gulm.c b/daemons/clvmd/clvmd-gulm.c
new file mode 100644 (file)
index 0000000..bef4cbe
--- /dev/null
@@ -0,0 +1,880 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  2002-2003  All rights reserved.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* This provides the interface between clvmd and gulm as the cluster
+ * and lock manager.
+ *
+ * It also provides the "liblm" functions as it's hard (and pointless)
+ * to separate them out when using gulm.
+ *
+ * What it does /not/ provide is the communications between clvmd daemons
+ * on the cluster nodes. That is done in tcp-comms.c
+ */
+
+
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <utmpx.h>
+#include <syslog.h>
+#include <assert.h>
+
+#include "ccs.h"
+#include "list.h"
+#include "locking.h"
+#include "log.h"
+#include "clvm.h"
+#include "clvmd-comms.h"
+#include "clvmd.h"
+#include "hash.h"
+#include "clvmd-gulm.h"
+#include "libgulm.h"
+#include "hash.h"
+
+/* Hash list of nodes in the cluster */
+static struct hash_table *node_hash;
+
+/* hash list of outstanding lock requests */
+static struct hash_table *lock_hash;
+
+/* Copy of the current core state */
+static uint8_t current_corestate;
+
+/* Number of active nodes */
+static int num_nodes;
+
+static char *cluster_name;
+
+static pthread_mutex_t lock_start_mutex;
+static volatile int lock_start_flag;
+
+struct node_info
+{
+    enum {NODE_UNKNOWN, NODE_DOWN, NODE_UP, NODE_CLVMD} state;
+    char name[MAX_CLUSTER_MEMBER_NAME_LEN];
+};
+
+struct lock_wait
+{
+    pthread_cond_t cond;
+    pthread_mutex_t mutex;
+    int status;
+};
+
+/* Forward */
+static int read_from_core_sock(struct local_client *client, char *buf, int len, char *csid,
+                              struct local_client **new_client);
+static int read_from_lock_sock(struct local_client *client, char *buf, int len, char *csid,
+                              struct local_client **new_client);
+static int get_all_cluster_nodes(void);
+
+/* In tcp-comms.c */
+extern struct hash_table *sock_hash;
+
+/* Wrap an internal (GULM library) file descriptor in a local_client of
+   type CLUSTER_INTERNAL and register it with the main select() loop, so
+   'callback' is invoked when the fd becomes readable.
+   Returns 0 on success, -1 if the client structure can't be allocated. */
+static int add_internal_client(int fd, fd_callback_t callback)
+{
+    struct local_client *client;
+
+    DEBUGLOG("Add_internal_client, fd = %d\n", fd);
+
+    /* Add a GULM file descriptor it to the main loop */
+    client = malloc(sizeof(struct local_client));
+    if (!client)
+    {
+       DEBUGLOG("malloc failed\n");
+       return -1;
+    }
+
+    memset(client, 0, sizeof(struct local_client));
+    client->fd = fd;
+    client->type = CLUSTER_INTERNAL;
+    client->callback = callback;
+    add_client(client);
+
+    return 0;
+}
+
+/* Gulm library handle */
+static gulm_interface_p gulm_if;
+static lg_core_callbacks_t core_callbacks;
+static lg_lockspace_callbacks_t lock_callbacks;
+
+/* SIGINT/SIGTERM handler: log out of GULM cleanly before exiting.
+   NOTE(review): cluster_closedown() and exit() are not async-signal-safe;
+   acceptable for a fatal-signal path but worth confirming. */
+static void badsig_handler(int sig)
+{
+    DEBUGLOG("got sig %d\n", sig);
+    cluster_closedown();
+    exit(0);
+}
+
+/* SIGHUP handler: re-read the node list from CCS so nodes added to the
+   config after startup are picked up (see get_all_cluster_nodes). */
+static void sighup_handler(int sig)
+{
+    DEBUGLOG("got SIGHUP\n");
+
+    /* Re-read CCS node list */
+    get_all_cluster_nodes();
+}
+
+/* Bring up the GULM side of clvmd: fetch the cluster name and node list
+   from CCS, initialise the GULM library, log in to the core and lock
+   servers, register their fds with the main loop, and install signal
+   handlers.  Locking in other threads stays blocked (lock_start_mutex)
+   until lock_login_reply() confirms the lock-server login.
+   Returns 0 on success, a non-zero status on failure.
+   NOTE(review): error handling is inconsistent - some paths return,
+   others exit(); and the ccs_connect/ccs_get results below are unchecked. */
+int init_cluster()
+{
+    int status;
+    int ccs_h;
+
+    /* Get cluster name from CCS */
+    /* TODO: is this right? */
+    ccs_h = ccs_connect();
+    ccs_get(ccs_h, "//cluster/@name", &cluster_name);
+    ccs_disconnect(ccs_h);
+
+    /* Block locking until we are logged in */
+    pthread_mutex_init(&lock_start_mutex, NULL);
+    pthread_mutex_lock(&lock_start_mutex);
+    lock_start_flag = 1;
+
+    node_hash = hash_create(100);
+    lock_hash = hash_create(10);
+
+    /* Get all nodes from CCS */
+    get_all_cluster_nodes();
+
+    /* Initialise GULM library */
+    status = lg_initialize(&gulm_if, cluster_name, "clvmd");
+    if (status)
+    {
+       DEBUGLOG("lg_initialize failed: %d\n", status);
+       return status;
+    }
+
+    /* Connect to core - we are not "important" :-) */
+    status = lg_core_login(gulm_if, 0);
+    if (status)
+    {
+       DEBUGLOG("lg_core_login failed: %d\n", status);
+       return status;
+    }
+
+    /* Initialise the inter-node comms */
+    status = init_comms();
+    if (status)
+       return status;
+
+    /* Add core FD to the list */
+    status = add_internal_client(lg_core_selector(gulm_if), read_from_core_sock);
+    if (status)
+    {
+       DEBUGLOG("can't allocate client space\n");
+       return status;
+    }
+
+    /* Connect to the lock server */
+    if (lg_lock_login(gulm_if, "CLVM"))
+    {
+       syslog(LOG_ERR, "Cannot login in to LOCK server\n");
+       DEBUGLOG("Cannot login in to LOCK server\n");
+       exit(88);
+    }
+
+    /* Add lockspace FD to the list */
+    status = add_internal_client(lg_lock_selector(gulm_if), read_from_lock_sock);
+    if (status)
+    {
+       DEBUGLOG("can't allocate client space\n");
+       exit(status);
+    }
+
+    /* Request a list of nodes, we can't really do anything until
+       this comes back */
+    status = lg_core_nodelist(gulm_if);
+    if (status)
+    {
+       DEBUGLOG("lg_core_nodelist failed: %d\n", status);
+       return status;
+    }
+
+    /* So I can kill it without taking GULM down too */
+    signal(SIGINT, badsig_handler);
+    signal(SIGTERM, badsig_handler);
+
+    /* Re-read the node list on SIGHUP */
+    signal(SIGHUP, sighup_handler);
+
+    return 0;
+}
+
+/* Orderly GULM shutdown: log out of the lock server and core, then
+   release the library handle.  Called at exit and from badsig_handler. */
+void cluster_closedown()
+{
+    DEBUGLOG("cluster_closedown\n");
+    lg_lock_logout(gulm_if);
+    lg_core_logout(gulm_if);
+    lg_core_shutdown(gulm_if);
+    lg_release(gulm_if);
+}
+
+/* Expire locks for a named node, or us.
+   Asks GULM to drop all expired locks held by 'nodename' (NULL means the
+   local node, resolved via uname) using an all-ones key mask so every
+   key matches. */
+#define GIO_KEY_SIZE 46
+static void drop_expired_locks(char *nodename)
+{
+    struct utsname nodeinfo;
+    uint8_t mask[GIO_KEY_SIZE];
+
+    /* 0xff in every byte == wildcard mask covering all lock keys */
+    memset(mask, 0xff, GIO_KEY_SIZE);
+
+    if (!nodename)
+    {
+       uname(&nodeinfo);
+       nodename = nodeinfo.nodename;
+    }
+
+    if (lg_lock_drop_exp(gulm_if, nodename, mask, GIO_KEY_SIZE))
+    {
+       DEBUGLOG("Error calling lg_lock_drop_exp()\n");
+    }
+}
+
+
+/* Main-loop callback for the GULM core fd: hand the readable socket to
+   the GULM library, which dispatches into core_callbacks.  Returns the
+   negative library status on error, else 1 (keep the client). */
+static int read_from_core_sock(struct local_client *client, char *buf, int len, char *csid,
+                              struct local_client **new_client)
+{
+    int status;
+
+    *new_client = NULL;
+    status = lg_core_handle_messages(gulm_if, &core_callbacks, NULL);
+    return status<0 ? status : 1;
+}
+
+/* Main-loop callback for the GULM lockspace fd: as above, but dispatches
+   into lock_callbacks. */
+static int read_from_lock_sock(struct local_client *client, char *buf, int len, char *csid,
+                              struct local_client **new_client)
+{
+    int status;
+
+    *new_client = NULL;
+    status = lg_lock_handle_messages(gulm_if, &lock_callbacks, NULL);
+    return status<0 ? status : 1;
+}
+
+
+/* CORE callback routines */
+/* Login reply from the core server: record the core state, or exit on a
+   login error (the daemon cannot run without core membership).
+   NOTE(review): "%lld" for a uint64_t is not portable; PRIu64 would be
+   correct - debug-only, so left as-is in this patch. */
+static int core_login_reply(void *misc, uint64_t gen, uint32_t error, uint32_t rank, uint8_t corestate)
+{
+   DEBUGLOG("CORE Got a Login reply.  gen:%lld err:%d rank:%d corestate:%d\n",
+         gen, error, rank, corestate);
+
+   if (error)
+       exit(error);
+
+   current_corestate = corestate;
+   return 0;
+}
+
+/* Apply a GULM nodestate to our node_info record, maintaining the
+   num_nodes count and tearing down the tcp connection when a node goes
+   down.  NODE_CLVMD (node runs a clvmd) is sticky against plain
+   "logged in" updates.
+   NOTE(review): the DOWN branch decrements num_nodes for nodes still in
+   NODE_UNKNOWN, which were never counted up - confirm the accounting. */
+static void set_node_state(struct node_info *ninfo, char *csid, uint8_t nodestate)
+{
+    if (nodestate == lg_core_Logged_in)
+    {
+       /* Don't clobber NODE_CLVMD state */
+       if (ninfo->state != NODE_CLVMD)
+       {
+           if (ninfo->state == NODE_UNKNOWN ||
+               ninfo->state == NODE_DOWN)
+               num_nodes++;
+
+           ninfo->state = NODE_UP;
+       }
+    }
+    else
+    {
+       if (nodestate == lg_core_Expired ||
+           nodestate == lg_core_Fenced ||
+           nodestate == lg_core_Logged_out)
+       {
+           if (ninfo->state != NODE_DOWN)
+               num_nodes--;
+           ninfo->state = NODE_DOWN;
+           /* Drop any clvmd-to-clvmd TCP connection to the dead node */
+           tcp_remove_client(csid);
+       }
+    }
+    DEBUGLOG("set_node_state, '%s' state = %d, num_nodes=%d\n",
+            ninfo->name, ninfo->state, num_nodes);
+}
+
+/* Look up a node by IP (used as the 4-byte csid key) and apply 'state'
+   to it.  If the node is unknown, the CCS config is re-read once in case
+   it was added after startup; nodes still unknown after that are ignored
+   (not part of the SAN cluster).  Returns the node record or NULL. */
+static struct node_info *add_or_set_node(char *name, uint32_t ip, uint8_t state)
+{
+    struct node_info *ninfo;
+
+    ninfo = hash_lookup_binary(node_hash, (char *)&ip, MAX_CSID_LEN);
+    if (!ninfo)
+    {
+       /* If we can't find that node then re-read the config file in case it
+          was added after we were started */
+       DEBUGLOG("Node %s not found, re-reading config file\n", name);
+       get_all_cluster_nodes();
+
+       /* Now try again */
+       ninfo = hash_lookup_binary(node_hash, (char *)&ip, MAX_CSID_LEN);
+       if (!ninfo)
+       {
+           DEBUGLOG("Ignoring node %s, not part of the SAN cluster\n", name);
+           return NULL;
+       }
+    }
+
+    set_node_state(ninfo, (char *)&ip, state);
+
+    return ninfo;
+}
+
+/* Core callback for the node-list reply requested in init_cluster().
+   Items update/create node records; the 'stop' marker completes daemon
+   startup (clvmd_cluster_init_completed) and marks ourself up. */
+static int core_nodelist(void *misc, lglcb_t type, char *name, uint32_t ip, uint8_t state)
+{
+    DEBUGLOG("CORE nodelist\n");
+
+    if (type == lglcb_start)
+    {
+       DEBUGLOG("Got Nodelist, start\n");
+    }
+    else
+    {
+       if (type == lglcb_item)
+       {
+           DEBUGLOG("Got nodelist, item: %s, %#x, %#x\n", name, ip, state);
+
+           add_or_set_node(name, ip, state);
+       }
+       else
+       {
+           if (type == lglcb_stop)
+           {
+               char ourcsid[MAX_CSID_LEN];
+
+               DEBUGLOG("Got Nodelist, stop\n");
+               /* List complete - clvmd can now finish initialising */
+               clvmd_cluster_init_completed();
+
+               /* Mark ourself as up */
+               get_our_csid(ourcsid);
+               add_up_node(ourcsid);
+           }
+           else
+           {
+               DEBUGLOG("Unknown lglcb_t %#x\n", type);
+           }
+       }
+    }
+
+    return 0;
+}
+
+/* Core callback: cache the new core state (used by is_quorate()). */
+static int core_statechange(void *misc, uint8_t corestate, uint32_t masterip, char *mastername)
+{
+    DEBUGLOG("CORE Got statechange  corestate:%#x masterip:%#x mastername:%s\n",
+            corestate, masterip, mastername);
+
+    current_corestate = corestate;
+    return 0;
+}
+
+/* Core callback: a node's membership state changed.  Updates our record
+   (resolving a missing IP by name first) and, if the node went down,
+   asks GULM to drop its expired locks so the cluster can make progress. */
+static int core_nodechange(void *misc, char *nodename, uint32_t nodeip, uint8_t nodestate)
+{
+    struct node_info *ninfo;
+
+    DEBUGLOG("CORE node change, name=%s, ip=%x, state = %d\n", nodename, nodeip, nodestate);
+
+    /* If we don't get nodeip here, try a lookup by name */
+    if (!nodeip)
+       csid_from_name((char *)&nodeip, nodename);
+    if (!nodeip)
+       return 0;
+
+    ninfo = add_or_set_node(nodename, nodeip, nodestate);
+    if (!ninfo)
+       return 0;
+
+    /* Check if we need to drop any expired locks */
+    if (ninfo->state == NODE_DOWN)
+    {
+       drop_expired_locks(nodename);
+    }
+
+    return 0;
+}
+/* Core callback: log-and-continue on an error report from the core
+   server (no recovery action is taken yet). */
+static int core_error(void *misc, uint32_t err)
+{
+    DEBUGLOG("CORE error: %d\n", err);
+    // Not sure what happens here
+    return 0;
+}
+
+/* LOCK callback routines */
+/* Login reply from the lock server.  Exits on error; otherwise drops any
+   of our own locks left expired from a previous incarnation, and releases
+   lock_start_mutex so locking threads blocked in _lock_resource() may
+   proceed (the mutex was taken in init_cluster). */
+static int lock_login_reply(void *misc, uint32_t error, uint8_t which)
+{
+    DEBUGLOG("LOCK Got a Login reply.  err:%d which:%d\n",
+            error, which);
+
+    if (error)
+       exit(error);
+
+    /* Drop any expired locks for us that might be hanging around */
+    drop_expired_locks(NULL);
+
+    /* Enable locking operations in other threads */
+    if (lock_start_flag)
+    {
+       lock_start_flag = 0;
+       pthread_mutex_unlock(&lock_start_mutex);
+    }
+
+    return 0;
+}
+
+/* Lock-state reply from the lock server: find the waiting thread's
+   lock_wait via lock_hash (keys are the NUL-terminated resource names
+   inserted by _lock_resource/_unlock_resource), record the error and
+   wake it.  A missing entry just logs - e.g. a reply that raced with
+   an error-path cleanup. */
+static int lock_lock_state(void *misc, uint8_t *key, uint16_t keylen, uint8_t state, uint32_t flags, uint32_t error,
+                          uint8_t *LVB, uint16_t LVBlen)
+{
+    struct lock_wait *lwait;
+
+    DEBUGLOG("LOCK lock state: %s, error = %d\n", key, error);
+
+    lwait = hash_lookup(lock_hash, key);
+    if (!lwait)
+    {
+       DEBUGLOG("Can't find hash entry for resource %s\n", key);
+       return 0;
+    }
+    lwait->status = error;
+    /* Take the waiter's mutex before signalling so the wake-up cannot be
+       lost to a thread not yet in pthread_cond_wait() */
+    pthread_mutex_lock(&lwait->mutex);
+    pthread_cond_signal(&lwait->cond);
+    pthread_mutex_unlock(&lwait->mutex);
+
+    return 0;
+}
+/* Lock-server error callback: log-and-continue (no recovery yet). */
+static int lock_error(void *misc, uint32_t err)
+{
+    DEBUGLOG("LOCK error: %d\n", err);
+    // Not sure what happens here
+    return 0;
+}
+
+
+/* CORE callbacks */
+/* Dispatch tables handed to lg_core_handle_messages() /
+   lg_lock_handle_messages(); forward-declared earlier in this file so
+   the fd-reader functions can reference them. */
+static lg_core_callbacks_t core_callbacks = {
+    .login_reply  = core_login_reply,
+    .nodelist     = core_nodelist,
+    .statechange  = core_statechange,
+    .nodechange   = core_nodechange,
+    .error        = core_error,
+};
+
+/* LOCK callbacks */
+static lg_lockspace_callbacks_t lock_callbacks = {
+    .login_reply   = lock_login_reply,
+    .lock_state    = lock_lock_state,
+    .error         = lock_error,
+};
+
+/* Allow tcp-comms to loop round the list of active nodes.
+   Iterator over node_hash skipping NODE_DOWN entries: *context holds the
+   hash cursor (start with *context == NULL).  Copies the next live
+   node's csid into 'csid' and returns 1, or returns 0 at the end. */
+int get_next_node_csid(void **context, char *csid)
+{
+    struct node_info *ninfo = NULL;
+
+    /* First node */
+    if (!*context)
+    {
+       *context = hash_get_first(node_hash);
+    }
+    else
+    {
+       *context = hash_get_next(node_hash, *context);
+    }
+    if (*context)
+       ninfo = hash_get_data(node_hash, *context);
+
+    /* Find a node that is UP */
+    while (*context && ninfo->state == NODE_DOWN)
+    {
+       *context = hash_get_next(node_hash, *context);
+       if (*context)
+       {
+           ninfo = hash_get_data(node_hash, *context);
+       }
+    }
+
+    if (!*context || ninfo->state == NODE_DOWN)
+    {
+       return 0;
+    }
+
+    memcpy(csid, hash_get_key(node_hash, *context), MAX_CSID_LEN);
+    return 1;
+}
+
+/* Translate a csid (4-byte IP key) to a node name; on an unknown csid
+   formats a fallback "UNKNOWN [a.b.c.d]" string and returns -1.
+   NOTE(review): 'name' size is the caller's contract (unchecked here),
+   and csid bytes print via signed %d so octets > 127 show negative. */
+int name_from_csid(char *csid, char *name)
+{
+    struct node_info *ninfo;
+
+    ninfo = hash_lookup_binary(node_hash, csid, MAX_CSID_LEN);
+    if (!ninfo)
+    {
+       sprintf(name, "UNKNOWN [%d.%d.%d.%d]",
+               csid[0], csid[1], csid[2], csid[3]);
+       return -1;
+    }
+
+    strcpy(name, ninfo->name);
+    return 0;
+}
+
+
+/* Reverse lookup: linear scan of node_hash for a node record matching
+   'name'; copies its MAX_CSID_LEN key into 'csid'.  Returns 0 on a
+   match, -1 if the name is unknown. */
+int csid_from_name(char *csid, char *name)
+{
+    struct hash_node *hn;
+    struct node_info *ninfo;
+
+    hash_iterate(hn, node_hash)
+    {
+       ninfo = hash_get_data(node_hash, hn);
+       if (strcmp(ninfo->name, name) == 0)
+       {
+           memcpy(csid, hash_get_key(node_hash, hn), MAX_CSID_LEN);
+           return 0;
+       }
+    }
+    return -1;
+}
+
+/* Return the cached count of active nodes (maintained by
+   set_node_state). */
+int get_num_nodes()
+{
+    DEBUGLOG("num_nodes = %d\n", num_nodes);
+    return num_nodes;
+}
+
+/* Node is now known to be running a clvmd */
+/* Promote the node's record to NODE_CLVMD; a csid we have no record for
+   is silently ignored. */
+void add_up_node(char *csid)
+{
+    struct node_info *ninfo;
+
+    ninfo = hash_lookup_binary(node_hash, csid, MAX_CSID_LEN);
+    if (!ninfo)
+       return;
+
+    ninfo->state = NODE_CLVMD;
+    return;
+
+}
+/* Node is now known to be NOT running a clvmd */
+/* Demote NODE_CLVMD back to NODE_UP only; any other state is left for
+   GULM membership events to manage. */
+void add_down_node(char *csid)
+{
+    struct node_info *ninfo;
+
+    ninfo = hash_lookup_binary(node_hash, csid, MAX_CSID_LEN);
+    if (!ninfo)
+       return;
+
+    /* Only set it to UP if it was previously known to be
+       running clvmd - gulm may set it DOWN quite soon */
+    if (ninfo->state == NODE_CLVMD)
+       ninfo->state = NODE_UP;
+    return;
+
+}
+
+/* Call a callback for each node, so the caller knows whether it's up or down */
+/* Iterates node_hash and invokes 'callback' once per node that has an
+   active socket in sock_hash (owned by tcp-comms), passing whether that
+   node is running clvmd (state == NODE_CLVMD).  Always returns 0. */
+int cluster_do_node_callback(struct local_client *master_client,
+                            void (*callback)(struct local_client *, char *csid, int node_up))
+{
+    struct hash_node *hn;
+    struct node_info *ninfo;
+
+    hash_iterate(hn, node_hash)
+    {
+       char csid[MAX_CSID_LEN];
+       struct local_client *client;
+
+       ninfo = hash_get_data(node_hash, hn);
+       memcpy(csid, hash_get_key(node_hash, hn), MAX_CSID_LEN);
+
+       DEBUGLOG("down_callback. node %s, state = %d\n", ninfo->name, ninfo->state);
+
+       /* Only nodes with an established clvmd socket get the callback */
+       client = hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN);
+       if (client)
+           callback(master_client, csid, ninfo->state == NODE_CLVMD);
+    }
+    return 0;
+}
+
+/* Convert gulm error codes to unix errno numbers.
+   Returns -1 (with errno set) for a non-zero gulm status, 0 for success. */
+static int gulm_to_errno(int gulm_ret)
+{
+    /* FIX: success must not touch errno - previously gulm_ret == 0 fell
+       into the default: branch and clobbered errno with EINVAL before
+       returning 0 */
+    if (!gulm_ret)
+       return 0;
+
+    switch (gulm_ret)
+    {
+    case lg_err_TryFailed:
+       errno = EAGAIN;
+       break;
+
+    case lg_err_AlreadyPend:
+       errno = EBUSY;
+       break;  /* FIX: missing break meant EBUSY fell through to EINVAL */
+
+       /* More?? */
+    default:
+       errno = EINVAL;
+    }
+
+    return -1;
+}
+
+/* Real locking */
+/* Take a single GULM lock synchronously.  Registers a stack lock_wait in
+   lock_hash (keyed by resource name) so lock_lock_state() can find and
+   wake us, then blocks on the condvar until the reply arrives.  May block
+   first on lock_start_mutex until the lock-server login has completed.
+   Returns 0 / -1+errno via gulm_to_errno(), or the raw request status. */
+static int _lock_resource(char *resource, int mode, int flags, int *lockid)
+{
+    int status;
+    struct lock_wait lwait;
+
+    /* Wait until the lock module is ready */
+    if (lock_start_flag)
+    {
+       pthread_mutex_lock(&lock_start_mutex);
+       pthread_mutex_unlock(&lock_start_mutex);
+    }
+
+    pthread_cond_init(&lwait.cond, NULL);
+    pthread_mutex_init(&lwait.mutex, NULL);
+    pthread_mutex_lock(&lwait.mutex);
+
+    /* This needs to be converted from DLM/LVM2 value for GULM */
+    if (flags == LCK_NONBLOCK) flags = lg_lock_flag_Try;
+
+    hash_insert(lock_hash, resource, &lwait);
+    DEBUGLOG("lock_resource '%s', flags=%d, mode=%d\n", resource, flags, mode);
+
+    status = lg_lock_state_req(gulm_if, resource, strlen(resource)+1,
+                              mode, flags, NULL, 0);
+    if (status)
+    {
+       DEBUGLOG("lg_lock_state returned %d\n", status);
+       /* FIX: clean up on the error path - previously the hash entry was
+          left pointing at this (soon dead) stack frame, which a late
+          lock_lock_state() callback could dereference, and the mutex was
+          returned still locked */
+       hash_remove(lock_hash, resource);
+       pthread_mutex_unlock(&lwait.mutex);
+       return status;
+    }
+
+    /* Wait for it to complete */
+    pthread_cond_wait(&lwait.cond, &lwait.mutex);
+    pthread_mutex_unlock(&lwait.mutex);
+
+    hash_remove(lock_hash, resource);
+    DEBUGLOG("lock-resource returning %d\n", lwait.status);
+
+    return gulm_to_errno(lwait.status);
+}
+
+
+/* Release a single GULM lock synchronously; mirror image of
+   _lock_resource() using lg_lock_state_Unlock (lockid is unused by GULM).
+   Returns 0 / -1+errno via gulm_to_errno(), or the raw request status. */
+static int _unlock_resource(char *resource, int lockid)
+{
+    int status;
+    struct lock_wait lwait;
+
+    pthread_cond_init(&lwait.cond, NULL);
+    pthread_mutex_init(&lwait.mutex, NULL);
+    pthread_mutex_lock(&lwait.mutex);
+
+    hash_insert(lock_hash, resource, &lwait);
+
+    DEBUGLOG("unlock_resource %s\n", resource);
+    status = lg_lock_state_req(gulm_if, resource, strlen(resource)+1,
+                              lg_lock_state_Unlock, 0, NULL, 0);
+
+    if (status)
+    {
+       DEBUGLOG("lg_lock_state(unlock) returned %d\n", status);
+       /* FIX: clean up on the error path - previously the hash entry kept
+          a dangling pointer to this stack frame and the mutex stayed
+          locked (same defect as _lock_resource) */
+       hash_remove(lock_hash, resource);
+       pthread_mutex_unlock(&lwait.mutex);
+       return status;
+    }
+
+    /* Wait for it to complete */
+
+    pthread_cond_wait(&lwait.cond, &lwait.mutex);
+    pthread_mutex_unlock(&lwait.mutex);
+
+    hash_remove(lock_hash, resource);
+
+    return gulm_to_errno(lwait.status);
+}
+
+
+/* These two locking functions MUST be called in a separate thread from
+   the clvmd main loop because they expect to be woken up by it.
+
+   These are abstractions around the real locking functions (above)
+   as we need to emulate the DLM's EX/PW/CW interaction with GULM using
+   two locks.
+   To aid unlocking, we store the lock mode in the lockid (as GULM
+   doesn't use this).
+*/
+/* Emulate DLM lock modes on GULM using two per-resource locks,
+   "<resource>-1" and "<resource>-2" (see the comment block above):
+   EXCL takes both exclusively, READ takes lock1 shared, WRITE takes
+   lock2 exclusively.  The held mode is stored in *lockid (GULM does not
+   use lock ids) so sync_unlock() knows what to release.
+   NOTE(review): the lg_err_TryFailed comparison below tests
+   _lock_resource()'s return, which is either a raw request status or
+   gulm_to_errno()'s -1/errno - confirm a Try failure actually reaches
+   this branch rather than the -1/EAGAIN form. */
+int sync_lock(const char *resource, int mode, int flags, int *lockid)
+{
+    int status;
+    char lock1[strlen(resource)+3];
+    char lock2[strlen(resource)+3];
+
+    snprintf(lock1, sizeof(lock1), "%s-1", resource);
+    snprintf(lock2, sizeof(lock2), "%s-2", resource);
+
+    switch (mode)
+    {
+    case LCK_EXCL:
+       status = _lock_resource(lock1, lg_lock_state_Exclusive, flags, lockid);
+       if (status)
+           goto out;
+
+       /* If we can't get this lock then bail out */
+       status = _lock_resource(lock2, lg_lock_state_Exclusive, LCK_NONBLOCK, lockid);
+        if (status == lg_err_TryFailed)
+        {
+           /* Lost the race for lock2: back out lock1 and report EAGAIN */
+           _unlock_resource(lock1, *lockid);
+           status = -1;
+           errno = EAGAIN;
+        }
+       break;
+
+    case LCK_READ:
+       status = _lock_resource(lock1, lg_lock_state_Shared, flags, lockid);
+       break;
+
+    case LCK_WRITE:
+       status = _lock_resource(lock2, lg_lock_state_Exclusive, flags, lockid);
+       break;
+
+    default:
+       status = -1;
+       errno = EINVAL;
+       break;
+    }
+ out:
+    /* Remember the mode for sync_unlock (even on failure) */
+    *lockid = mode;
+    return status;
+}
+
+/* Release the GULM lock pair taken by sync_lock(); 'lockid' carries the
+   lock mode that was stored there (asserted below), selecting which of
+   "<resource>-1"/"<resource>-2" to drop.  Returns the first failing
+   _unlock_resource() status, or 0. */
+int sync_unlock(const char *resource, int lockid)
+{
+    int status = 0;
+    char lock1[strlen(resource)+3];
+    char lock2[strlen(resource)+3];
+
+    snprintf(lock1, sizeof(lock1), "%s-1", resource);
+    snprintf(lock2, sizeof(lock2), "%s-2", resource);
+
+    /* The held lock mode is in the lock id */
+    assert(lockid == LCK_EXCL ||
+          lockid == LCK_READ ||
+          lockid == LCK_WRITE);
+
+    switch (lockid)
+    {
+    case LCK_EXCL:
+       /* EXCL holds both locks - release in the order they were taken */
+       status = _unlock_resource(lock1, lockid);
+       if (status)
+           goto out;
+       status = _unlock_resource(lock2, lockid);
+       break;
+
+    case LCK_READ:
+       status = _unlock_resource(lock1, lockid);
+       break;
+
+    case LCK_WRITE:
+       status = _unlock_resource(lock2, lockid);
+       break;
+    }
+
+ out:
+    return status;
+}
+
+/* Quorum test: we are quorate whenever the cached core state (updated by
+   core_login_reply/core_statechange) shows full cluster membership -
+   Slave, Master or Client. */
+int is_quorate()
+{
+    if (current_corestate == lg_core_Slave ||
+       current_corestate == lg_core_Master ||
+       current_corestate == lg_core_Client)
+       return 1;
+    else
+       return 0;
+}
+
+/* Get all the cluster node names & IPs from CCS and
+   add them to our node list so we know who to talk to.
+   Called when we start up and if we get sent SIGHUP.
+*/
+/* Read every <node> from CCS; nodes whose <clvm> flag is set and whose
+   IP resolves are added to node_hash (initially NODE_DOWN).  Existing
+   entries are kept as-is.  Returns 0 on success, -1 on CCS/alloc failure. */
+static int get_all_cluster_nodes()
+{
+    int ctree;
+    /* FIX: initialise - if the first ccs_get() fails without touching
+       nodename, the old code tested an uninitialised pointer */
+    char *nodename = NULL;
+    int error;
+
+    /* Open the config file */
+    ctree = ccs_connect();
+    if (ctree <= 0)
+    {
+       log_error("Error connecting to CCS");
+       return -1;
+    }
+
+    error = ccs_get(ctree, "//nodes/node/@name", &nodename);
+    while (nodename)
+    {
+       char nodeip[MAX_CSID_LEN];
+       char *clvmflag = NULL;
+       char key[256];
+
+       /* FIX: snprintf - nodename comes from the config file and could
+          overflow the fixed 256-byte key with plain sprintf */
+       snprintf(key, sizeof(key), "//nodes/node[@name=\"%s\"]/clvm", nodename);
+       /* FIX: clvmflag was read uninitialised when ccs_get() failed;
+          keep it NULL on failure and test before atoi().
+          (Assumes ccs_get returns non-zero on failure - TODO confirm) */
+       if (ccs_get(ctree, key, &clvmflag) != 0)
+           clvmflag = NULL;
+
+       if ((get_ip_address(nodename, nodeip) == 0) && clvmflag && atoi(clvmflag))
+       {
+           struct node_info *ninfo;
+
+           /* If it's not in the list, then add it */
+           ninfo = hash_lookup_binary(node_hash, nodeip, MAX_CSID_LEN);
+           if (!ninfo)
+           {
+               ninfo = malloc(sizeof(struct node_info));
+               if (!ninfo)
+               {
+                   syslog(LOG_ERR, "Cannot alloc memory for node info\n");
+                   ccs_disconnect(ctree);
+                   return -1;
+               }
+               strcpy(ninfo->name, nodename);
+
+               ninfo->state = NODE_DOWN;
+               hash_insert_binary(node_hash, nodeip, MAX_CSID_LEN, ninfo);
+           }
+       }
+       else
+       {
+           DEBUGLOG("node %s has clvm disabled\n", nodename);
+       }
+       if (clvmflag) free(clvmflag);
+       free(nodename);
+       /* FIX: reset before re-querying so a failing ccs_get() terminates
+          the loop instead of re-testing the freed pointer */
+       nodename = NULL;
+       error = ccs_get(ctree, "//nodes/node/@name", &nodename);
+    }
+
+    /* Finished with config file */
+    ccs_disconnect(ctree);
+
+    return 0;
+}
+
+/* Expose the GULM core selector fd (the same fd registered with the main
+   loop in init_cluster) for callers outside this file. */
+int gulm_fd(void)
+{
+    return lg_core_selector(gulm_if);
+}
diff --git a/daemons/clvmd/clvmd-gulm.h b/daemons/clvmd/clvmd-gulm.h
new file mode 100644 (file)
index 0000000..07726fa
--- /dev/null
@@ -0,0 +1,9 @@
+
+
+
+extern int get_next_node_csid(void **context, char *csid);
+extern void add_down_node(char *csid);
+extern int gulm_fd(void);
+extern int get_ip_address(char *node, char *addr);
+extern void tcp_remove_client(char *csid);
+extern int alloc_client(int fd, char *csid, struct local_client **new_client);
diff --git a/daemons/clvmd/clvmd.c b/daemons/clvmd/clvmd.c
new file mode 100644 (file)
index 0000000..216eb1e
--- /dev/null
@@ -0,0 +1,1693 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * CLVMD: Cluster LVM daemon
+ */
+
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/un.h>
+#include <sys/time.h>
+#include <sys/ioctl.h>
+#include <sys/utsname.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <signal.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <errno.h>
+
+#include "clvmd-comms.h"
+#include "lvm-functions.h"
+#include "clvm.h"
+#include "clvmd.h"
+#include "libdlm.h"
+#include "system-lv.h"
+#include "list.h"
+#include "log.h"
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/* The maximum size of a message that will fit into a packet. Anything bigger
+   than this is sent via the system LV */
+#define MAX_INLINE_MESSAGE (MAX_CLUSTER_MESSAGE-sizeof(struct clvm_header))
+
+#define ISLOCAL_CSID(c) (memcmp(c, our_csid, MAX_CSID_LEN) == 0)
+
+/* Head of the fd list. Also contains
+   the cluster_socket details */
+static struct local_client local_client_head;
+
+static unsigned short global_xid = 0;  /* Last transaction ID issued */
+
+static char our_csid[MAX_CSID_LEN];
+
+/* Structure of items on the LVM thread list */
+struct lvm_thread_cmd {
+       struct list list;
+
+       struct local_client *client;
+       struct clvm_header *msg;
+       char csid[MAX_CSID_LEN];
+       int remote;             /* Flag */
+       int msglen;
+       unsigned short xid;
+};
+static pthread_t lvm_thread;
+static pthread_mutex_t lvm_thread_mutex;
+static pthread_cond_t lvm_thread_cond;
+static struct list lvm_cmd_head;
+static int quit = 0;
+
+/* Prototypes for code further down */
+static void sigusr2_handler(int sig);
+static void sigterm_handler(int sig);
+static void send_local_reply(struct local_client *client, int status,
+                            int clientid);
+static void free_reply(struct local_client *client);
+static void send_version_message(void);
+static void *pre_and_post_thread(void *arg);
+static int send_message(void *buf, int msglen, char *csid, int fd,
+                       const char *errtext);
+static int read_from_local_sock(struct local_client *thisfd);
+static int process_local_command(struct clvm_header *msg, int msglen,
+                                struct local_client *client,
+                                unsigned short xid);
+static void process_remote_command(struct clvm_header *msg, int msglen, int fd,
+                                  char *csid);
+static int process_reply(struct clvm_header *msg, int msglen, char *csid);
+static int open_local_sock(void);
+static struct local_client *find_client(int clientid);
+static void main_loop(int local_sock, int cmd_timeout);
+static void be_daemon(void);
+static int check_all_clvmds_running(struct local_client *client);
+static int local_rendezvous_callback(struct local_client *thisfd, char *buf,
+                                    int len, char *csid,
+                                    struct local_client **new_client);
+static void *lvm_thread_fn(void *);
+static int add_to_lvmqueue(struct local_client *client, struct clvm_header *msg,
+                          int msglen, char *csid);
+static int distribute_command(struct local_client *thisfd);
+static void hton_clvm(struct clvm_header *hdr);
+static void ntoh_clvm(struct clvm_header *hdr);
+static void add_reply_to_list(struct local_client *client, int status,
+                             char *csid, const char *buf, int len);
+
+static void usage(char *prog, FILE *file)
+{
+       fprintf(file, "Usage:\n");
+       fprintf(file, "%s [Vhd]\n", prog);
+       fprintf(file, "\n");
+       fprintf(file, "   -V       Show version of clvmd\n");
+       fprintf(file, "   -h       Show this help information\n");
+       fprintf(file, "   -d       Don't fork, run in the foreground\n");
+       fprintf(file, "   -t<secs> Command timeout (default 60 seconds)\n");
+       fprintf(file, "\n");
+}
+
+int main(int argc, char *argv[])
+{
+       int local_sock;
+       struct local_client *newfd;
+       struct utsname nodeinfo;
+       signed char opt;
+       int debug = 0;
+       int cmd_timeout = DEFAULT_CMD_TIMEOUT;
+       sigset_t ss;
+
+       /* Deal with command-line arguments */
+       opterr = 0;
+       optind = 0;
+       while ((opt = getopt(argc, argv, "?vVhdt:")) != EOF) {
+               switch (opt) {
+               case 'h':
+                       usage(argv[0], stdout);
+                       exit(0);
+
+               case '?':
+                       usage(argv[0], stderr);
+                       exit(0);
+
+               case 'd':
+                       debug++;
+                       break;
+
+               case 't':
+                       cmd_timeout = atoi(optarg);
+                       if (!cmd_timeout) {
+                               fprintf(stderr, "command timeout is invalid\n");
+                               usage(argv[0], stderr);
+                               exit(1);
+                       }
+                       break;
+
+               case 'V':
+                       printf("\nCluster LVM Daemon version %d.%d.%d\n\n",
+                              CLVMD_MAJOR_VERSION, CLVMD_MINOR_VERSION,
+                              CLVMD_PATCH_VERSION);
+                       exit(1);
+                       break;
+
+               }
+       }
+
+       /* Fork into the background (unless requested not to) */
+       if (!debug) {
+               be_daemon();
+       }
+
+       DEBUGLOG("CLVMD started\n");
+
+       /* Open the Unix socket we listen for commands on.
+          We do this before opening the cluster socket so that
+          potential clients will block rather than error if we are running
+          but the cluster is not ready yet */
+       local_sock = open_local_sock();
+       if (local_sock < 0)
+               exit(2);
+
+       /* Set up signal handlers: USR1 is for cluster change notifications (in cman),
+          USR2 causes child threads to exit.
+          PIPE should be ignored */
+       signal(SIGUSR2, sigusr2_handler);
+       signal(SIGTERM, sigterm_handler);
+       signal(SIGINT, sigterm_handler);
+       signal(SIGPIPE, SIG_IGN);
+
+       /* Block SIGUSR2 in the main process */
+       sigemptyset(&ss);
+       sigaddset(&ss, SIGUSR2);
+       sigprocmask(SIG_BLOCK, &ss, NULL);
+
+       /* Initialise the LVM thread variables */
+       list_init(&lvm_cmd_head);
+       pthread_mutex_init(&lvm_thread_mutex, NULL);
+       pthread_cond_init(&lvm_thread_cond, NULL);
+       init_lvhash();
+
+       /* Start the cluster interface */
+       if (init_cluster()) {
+               DEBUGLOG("Can't initialise cluster interface\n");
+               log_error("Can't initialise cluster interface\n");
+               exit(5);
+       }
+       DEBUGLOG("Cluster ready, doing some more initialisation\n");
+
+       /* Save our CSID */
+       uname(&nodeinfo);
+       get_our_csid(our_csid);
+
+       /* Initialise the FD list head */
+       local_client_head.fd = get_main_cluster_fd();
+       local_client_head.type = CLUSTER_MAIN_SOCK;
+       local_client_head.callback = cluster_fd_callback;
+
+       /* Add the local socket to the list */
+       newfd = malloc(sizeof(struct local_client));
+       if (!newfd)
+               exit(2);
+
+       newfd->fd = local_sock;
+       newfd->type = LOCAL_RENDEZVOUS;
+       newfd->callback = local_rendezvous_callback;
+       newfd->next = local_client_head.next;
+       local_client_head.next = newfd;
+
+       /* This needs to be started after cluster initialisation
+          as it may need to take out locks */
+       DEBUGLOG("starting LVM thread\n");
+       pthread_create(&lvm_thread, NULL, lvm_thread_fn, nodeinfo.nodename);
+
+#ifndef USE_GULM
+       /* Tell the rest of the cluster our version number */
+       /* CMAN can do this immediately, gulm needs to wait until
+          the core initialisation has finished and the node list
+          has been gathered */
+       send_version_message();
+#endif
+
+       DEBUGLOG("clvmd ready for work\n");
+
+       /* Do some work */
+       main_loop(local_sock, cmd_timeout);
+
+       return 0;
+}
+
+/* Called when the GuLM cluster layer has completed initialisation.
+   We send the version message */
+void clvmd_cluster_init_completed()
+{
+       send_version_message();
+}
+
+/* Data on a connected socket */
+static int local_sock_callback(struct local_client *thisfd, char *buf, int len,
+                              char *csid, struct local_client **new_client)
+{
+       *new_client = NULL;
+       return read_from_local_sock(thisfd);
+}
+
+/* Data on a connected socket */
+static int local_rendezvous_callback(struct local_client *thisfd, char *buf,
+                                    int len, char *csid,
+                                    struct local_client **new_client)
+{
+       /* Someone connected to our local socket, accept it. */
+
+       struct sockaddr_un socka;
+       struct local_client *newfd;
+       socklen_t sl = sizeof(socka);
+       int client_fd = accept(thisfd->fd, (struct sockaddr *) &socka, &sl);
+
+       if (client_fd >= 0) {
+               newfd = malloc(sizeof(struct local_client));
+               if (!newfd) {
+                       close(client_fd);
+                       return 1;
+               }
+               newfd->fd = client_fd;
+               newfd->type = LOCAL_SOCK;
+               newfd->xid = 0;
+               newfd->callback = local_sock_callback;
+               newfd->bits.localsock.replies = NULL;
+               newfd->bits.localsock.expected_replies = 0;
+               newfd->bits.localsock.cmd = NULL;
+               newfd->bits.localsock.in_progress = FALSE;
+               newfd->bits.localsock.sent_out = FALSE;
+               newfd->bits.localsock.threadid = 0;
+               newfd->bits.localsock.finished = 0;
+               newfd->bits.localsock.pipe_client = NULL;
+               newfd->bits.localsock.all_success = 1;
+               DEBUGLOG("Got new connection on fd %d\n", newfd->fd);
+               *new_client = newfd;
+       }
+       return 1;
+}
+
+static int local_pipe_callback(struct local_client *thisfd, char *buf,
+                              int maxlen, char *csid,
+                              struct local_client **new_client)
+{
+       int len;
+       char buffer[PIPE_BUF];
+       struct local_client *sock_client = thisfd->bits.pipe.client;
+       int status = -1;        /* in error by default */
+
+       len = read(thisfd->fd, buffer, sizeof(int));
+
+       DEBUGLOG("read on PIPE %d: %d bytes: status: %d\n",
+                thisfd->fd, len, *(int *) buffer);
+
+       if (len == sizeof(int)) {
+               status = *(int *) buffer;
+       }
+
+       /* EOF on pipe or an error, close it */
+       if (len <= 0) {
+               int jstat;
+               close(thisfd->fd);
+
+               /* Clear out the cross-link */
+               if (thisfd->bits.pipe.client != NULL)
+                       thisfd->bits.pipe.client->bits.localsock.pipe_client =
+                           NULL;
+
+               /* Reap child thread */
+               if (thisfd->bits.pipe.threadid) {
+                       jstat =
+                           pthread_join(thisfd->bits.pipe.threadid,
+                                        (void **) &status);
+                       thisfd->bits.pipe.threadid = 0;
+                       if (thisfd->bits.pipe.client != NULL)
+                               thisfd->bits.pipe.client->bits.localsock.
+                                   threadid = 0;
+               }
+               return -1;
+       } else {
+               DEBUGLOG("background routine status was %d, sock_client=%p\n",
+                        status, sock_client);
+               /* But has the client gone away ?? */
+               if (sock_client == NULL) {
+                       DEBUGLOG
+                           ("Got PIPE response for dead client, ignoring it\n");
+               } else {
+                       /* If error then just return that code */
+                       if (status)
+                               send_local_reply(sock_client, status,
+                                                sock_client->fd);
+                       else {
+                               if (sock_client->bits.localsock.state ==
+                                   POST_COMMAND) {
+                                       send_local_reply(sock_client, 0,
+                                                        sock_client->fd);
+                               } else  // PRE_COMMAND finished.
+                               {
+                                       if (
+                                           (status =
+                                            distribute_command(sock_client)) !=
+                                           0) send_local_reply(sock_client,
+                                                               EFBIG,
+                                                               sock_client->
+                                                               fd);
+                               }
+                       }
+               }
+       }
+       return len;
+}
+
+/* If a node is up, look for it in the reply array; if it's not there then
+   add one with "ETIMEDOUT".
+   NOTE: This won't race with real replies because they happen in the same thread.
+*/
+static void timedout_callback(struct local_client *client, char *csid,
+                             int node_up)
+{
+       if (node_up) {
+               struct node_reply *reply;
+               char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
+
+               name_from_csid(csid, nodename);
+               DEBUGLOG("PJC: checking for a reply from %s\n", nodename);
+               pthread_mutex_lock(&client->bits.localsock.reply_mutex);
+
+               reply = client->bits.localsock.replies;
+               while (reply && strcmp(reply->node, nodename) != 0) {
+                       reply = reply->next;
+               }
+
+               pthread_mutex_unlock(&client->bits.localsock.reply_mutex);
+
+               if (!reply) {
+                       DEBUGLOG("PJC: node %s timed-out\n", nodename);
+                       add_reply_to_list(client, ETIMEDOUT, csid,
+                                         "Command timed out", 18);
+               }
+       }
+}
+
+/* Called when the request has timed out on at least one node. We fill in
+   the remaining node entries with ETIMEDOUT and return.
+
+   By the time we get here the node that caused
+   the timeout could have gone down, in which case we will never get the expected
+   number of replies that triggers the post command so we need to do it here
+*/
+static void request_timed_out(struct local_client *client)
+{
+       DEBUGLOG("Request timed-out. padding\n");
+       cluster_do_node_callback(client, timedout_callback);
+
+       if (client->bits.localsock.num_replies !=
+           client->bits.localsock.expected_replies) {
+               /* Post-process the command */
+               if (client->bits.localsock.threadid) {
+                       pthread_mutex_lock(&client->bits.localsock.mutex);
+                       client->bits.localsock.state = POST_COMMAND;
+                       pthread_cond_signal(&client->bits.localsock.cond);
+                       pthread_mutex_unlock(&client->bits.localsock.mutex);
+               }
+       }
+}
+
+/* This is where the real work happens */
+static void main_loop(int local_sock, int cmd_timeout)
+{
+       DEBUGLOG("Using timeout of %d seconds\n", cmd_timeout);
+
+       /* Main loop */
+       while (!quit) {
+               fd_set in;
+               int select_status;
+               struct local_client *thisfd;
+               struct timeval tv = { cmd_timeout, 0 };
+               int quorate = is_quorate();
+
+               /* Wait on the cluster FD and all local sockets/pipes */
+               FD_ZERO(&in);
+               for (thisfd = &local_client_head; thisfd != NULL;
+                    thisfd = thisfd->next) {
+                       /* if the cluster is not quorate then don't listen for new requests */
+                       if ((thisfd->type != LOCAL_RENDEZVOUS &&
+                            thisfd->type != LOCAL_SOCK) || quorate)
+                               FD_SET(thisfd->fd, &in);
+               }
+
+               if ((select_status = select(FD_SETSIZE, &in, NULL, NULL, &tv)) > 0) {
+                       struct local_client *lastfd = NULL;
+                       struct clvm_header *inheader;
+                       char csid[MAX_CSID_LEN];
+                       char buf[MAX_CLUSTER_MESSAGE];
+
+                       for (thisfd = &local_client_head; thisfd != NULL;
+                            thisfd = thisfd->next) {
+                               if (FD_ISSET(thisfd->fd, &in)) {
+                                       struct local_client *newfd;
+                                       int ret;
+
+                                       /* Do callback */
+                                       ret =
+                                           thisfd->callback(thisfd, buf,
+                                                            sizeof(buf), csid,
+                                                            &newfd);
+                                       /* Ignore EAGAIN */
+                                       if (ret < 0 && (errno == EAGAIN ||
+                                                       errno == EINTR)) continue;
+
+                                       /* Got error or EOF: Remove it from the list safely */
+                                       if (ret <= 0) {
+                                               struct local_client *free_fd;
+                                               int type = thisfd->type;
+
+                                               /* If the cluster socket shuts down, so do we */
+                                               if (type == CLUSTER_MAIN_SOCK ||
+                                                   type == CLUSTER_INTERNAL)
+                                                       goto closedown;
+
+                                               DEBUGLOG
+                                                   ("ret == %d, errno = %d. removing client\n",
+                                                    ret, errno);
+                                               lastfd->next = thisfd->next;
+                                               free_fd = thisfd;
+                                               thisfd = lastfd;
+                                               free(free_fd);
+                                               break;
+                                       }
+
+                                       /* New client...simply add it to the list */
+                                       if (newfd) {
+                                               newfd->next = thisfd->next;
+                                               thisfd->next = newfd;
+                                               break;
+                                       }
+
+                                       switch (thisfd->type) {
+                                       case CLUSTER_MAIN_SOCK:
+                                       case CLUSTER_DATA_SOCK:
+                                               inheader =
+                                                   (struct clvm_header *) buf;
+                                               ntoh_clvm(inheader);    /* Byteswap fields */
+                                               if (inheader->cmd ==
+                                                   CLVMD_CMD_REPLY)
+                                                           process_reply
+                                                           (inheader, ret,
+                                                            csid);
+                                               else
+                                                       add_to_lvmqueue(thisfd,
+                                                                       inheader,
+                                                                       ret,
+                                                                       csid);
+                                               break;
+
+                                               /* All the work for these is done in the callback
+                                                  rightly or wrongly... */
+                                       case LOCAL_RENDEZVOUS:
+                                       case LOCAL_SOCK:
+                                       case THREAD_PIPE:
+                                       case CLUSTER_INTERNAL:
+                                               break;
+                                       }
+                               }
+                               lastfd = thisfd;
+                       }
+               }
+
+               /* Select timed out. Check for clients that have been waiting too long for a response */
+               if (select_status == 0) {
+                       time_t the_time = time(NULL);
+
+                       for (thisfd = &local_client_head; thisfd != NULL;
+                            thisfd = thisfd->next) {
+                               if (thisfd->type == LOCAL_SOCK
+                                   && thisfd->bits.localsock.sent_out
+                                   && thisfd->bits.localsock.sent_time +
+                                   cmd_timeout < the_time
+                                   && thisfd->bits.localsock.
+                                   expected_replies !=
+                                   thisfd->bits.localsock.num_replies) {
+                                       /* Send timed out message + replies we already have */
+                                       DEBUGLOG
+                                           ("Request timed-out (send: %ld, now: %ld)\n",
+                                            thisfd->bits.localsock.sent_time,
+                                            the_time);
+
+                                       thisfd->bits.localsock.all_success = 0;
+
+                                       request_timed_out(thisfd);
+                               }
+                       }
+               }
+               if (select_status < 0) {
+                       if (errno == EINTR)
+                               continue;
+
+#ifdef DEBUG
+                       perror("select error");
+                       exit(-1);
+#endif
+               }
+       }
+
+      closedown:
+       cluster_closedown();
+       close(local_sock);
+}
+
+/* Fork into the background and detach from our parent process */
+static void be_daemon()
+{
+       pid_t pid;
+       int devnull = open("/dev/null", O_RDWR);
+       if (devnull == -1) {
+               perror("Can't open /dev/null");
+               exit(3);
+       }
+
+       switch (pid = fork()) {
+       case -1:
+               perror("clvmd: can't fork");
+               exit(2);
+
+       case 0:         /* child */
+               break;
+
+       default:                /* Parent */
+               exit(0);
+       }
+
+       /* Detach ourself from the calling environment */
+       if (close(0) || close(1) || close(2)) {
+               perror("Error closing terminal FDs");
+               exit(4);
+       }
+       setsid();
+
+       if (dup2(devnull, 0) < 0 || dup2(devnull, 1) < 0
+           || dup2(devnull, 2) < 0) {
+               perror("Error setting terminal FDs to /dev/null");
+               log_error("Error setting terminal FDs to /dev/null: %m");
+               exit(5);
+       }
+       if (chdir("/")) {
+               log_error("Error setting current directory to /: %m");
+               exit(6);
+       }
+
+}
+
+/* Called when we have a read from the local socket.
+   This logic was originally in the main loop but has grown into its own function */
+static int read_from_local_sock(struct local_client *thisfd)
+{
+       int len;
+       int argslen;
+       int missing_len;
+       char buffer[PIPE_BUF];
+
+       len = read(thisfd->fd, buffer, sizeof(buffer));
+
+       DEBUGLOG("Read on local socket %d, len = %d\n", thisfd->fd, len);
+
+       /* EOF or error on socket */
+       if (len <= 0) {
+               int *status;
+               int jstat;
+
+               DEBUGLOG("EOF on local socket: inprogress=%d\n",
+                        thisfd->bits.localsock.in_progress);
+
+               thisfd->bits.localsock.finished = 1;
+
+               /* If the client went away in mid command then tidy up */
+               if (thisfd->bits.localsock.in_progress) {
+                       pthread_mutex_lock(&thisfd->bits.localsock.mutex);
+                       thisfd->bits.localsock.state = POST_COMMAND;
+                       pthread_cond_signal(&thisfd->bits.localsock.cond);
+                       pthread_mutex_unlock(&thisfd->bits.localsock.mutex);
+
+                       /* Free any unsent buffers */
+                       free_reply(thisfd);
+               }
+
+               /* Kill the subthread & free resources */
+               if (thisfd->bits.localsock.threadid) {
+                       DEBUGLOG("Waiting for child thread\n");
+                       pthread_mutex_lock(&thisfd->bits.localsock.mutex);
+                       thisfd->bits.localsock.state = POST_COMMAND;
+                       pthread_cond_signal(&thisfd->bits.localsock.cond);
+                       pthread_mutex_unlock(&thisfd->bits.localsock.mutex);
+                       pthread_kill(thisfd->bits.localsock.threadid, SIGUSR2);
+
+                       jstat =
+                           pthread_join(thisfd->bits.localsock.threadid,
+                                        (void **) &status);
+                       DEBUGLOG("Joined child thread\n");
+
+                       thisfd->bits.localsock.threadid = 0;
+                       pthread_cond_destroy(&thisfd->bits.localsock.cond);
+                       pthread_mutex_destroy(&thisfd->bits.localsock.mutex);
+
+                       /* Remove the pipe client */
+                       if (thisfd->bits.localsock.pipe_client != NULL) {
+                               struct local_client *newfd;
+                               struct local_client *lastfd = NULL;
+                               struct local_client *free_fd = NULL;
+
+                               close(thisfd->bits.localsock.pipe_client->fd);  /* Close pipe */
+                               close(thisfd->bits.localsock.pipe);
+
+                               /* Remove pipe client */
+                               for (newfd = &local_client_head; newfd != NULL;
+                                    newfd = newfd->next) {
+                                       if (thisfd->bits.localsock.
+                                           pipe_client == newfd) {
+                                               thisfd->bits.localsock.
+                                                   pipe_client = NULL;
+
+                                               lastfd->next = newfd->next;
+                                               free_fd = newfd;
+                                               newfd->next = lastfd;
+                                               free(free_fd);
+                                               break;
+                                       }
+                                       lastfd = newfd;
+                               }
+                       }
+               }
+
+               /* Free the command buffer */
+               if (thisfd->bits.localsock.cmd)
+                       free(thisfd->bits.localsock.cmd);
+
+               /* Clear out the cross-link */
+               if (thisfd->bits.localsock.pipe_client != NULL)
+                       thisfd->bits.localsock.pipe_client->bits.pipe.client =
+                           NULL;
+
+               close(thisfd->fd);
+               return 0;
+       } else {
+               int comms_pipe[2];
+               struct local_client *newfd;
+               char csid[MAX_CSID_LEN];
+               struct clvm_header *inheader;
+
+               inheader = (struct clvm_header *) buffer;
+
+               /* Fill in the client ID */
+               inheader->clientid = htonl(thisfd->fd);
+
+               /* If we are already busy then return an error */
+               if (thisfd->bits.localsock.in_progress) {
+                       struct clvm_header reply;
+                       reply.cmd = CLVMD_CMD_REPLY;
+                       reply.status = -EBUSY;
+                       reply.arglen = 0;
+                       reply.flags = 0;
+                       send_message(&reply, sizeof(reply), our_csid,
+                                    thisfd->fd,
+                                    "Error sending EBUSY reply to local user");
+                       return len;
+               }
+
+               /* Free any old buffer space */
+               if (thisfd->bits.localsock.cmd)
+                       free(thisfd->bits.localsock.cmd);
+
+               /* See if we have the whole message */
+               argslen =
+                   len - strlen(inheader->node) - sizeof(struct clvm_header);
+               missing_len = inheader->arglen - argslen;
+
+               /* Save the message */
+               thisfd->bits.localsock.cmd = malloc(len + missing_len);
+               if (!thisfd->bits.localsock.cmd) {
+                       struct clvm_header reply;
+                       reply.cmd = CLVMD_CMD_REPLY;
+                       reply.status = -ENOMEM;
+                       reply.arglen = 0;
+                       reply.flags = 0;
+                       send_message(&reply, sizeof(reply), our_csid,
+                                    thisfd->fd,
+                                    "Error sending ENOMEM reply to local user");
+                       return 0;
+               }
+               memcpy(thisfd->bits.localsock.cmd, buffer, len);
+               thisfd->bits.localsock.cmd_len = len + missing_len;
+               inheader = (struct clvm_header *) thisfd->bits.localsock.cmd;
+
+               /* If we don't have the full message then read the rest now */
+               if (missing_len) {
+                       char *argptr =
+                           inheader->node + strlen(inheader->node) + 1;
+
+                       while (missing_len > 0 && len >= 0) {
+                               DEBUGLOG
+                                   ("got %d bytes, need another %d (total %d)\n",
+                                    argslen, missing_len, inheader->arglen);
+                               len =
+                                   read(thisfd->fd, argptr + argslen,
+                                        missing_len);
+                               if (len >= 0) {
+                                       missing_len -= len;
+                                       argslen += len;
+                               }
+                       }
+               }
+
+               /* Only run the command if all the cluster nodes are running CLVMD */
+               if (((inheader->flags & CLVMD_FLAG_LOCAL) == 0) &&
+                   (check_all_clvmds_running(thisfd) == -1)) {
+                       thisfd->bits.localsock.expected_replies = 0;
+                       thisfd->bits.localsock.num_replies = 0;
+                       send_local_reply(thisfd, EHOSTDOWN, thisfd->fd);
+                       return len;
+               }
+
+               /* Check the node name for validity */
+               if (inheader->node[0] && csid_from_name(csid, inheader->node)) {
+                       /* Error, node is not in the cluster */
+                       struct clvm_header reply;
+                       DEBUGLOG("Unknown node: '%s'\n", inheader->node);
+
+                       reply.cmd = CLVMD_CMD_REPLY;
+                       reply.status = -ENOENT;
+                       reply.flags = 0;
+                       reply.arglen = 0;
+                       send_message(&reply, sizeof(reply), our_csid,
+                                    thisfd->fd,
+                                    "Error sending ENOENT reply to local user");
+                       thisfd->bits.localsock.expected_replies = 0;
+                       thisfd->bits.localsock.num_replies = 0;
+                       thisfd->bits.localsock.in_progress = FALSE;
+                       thisfd->bits.localsock.sent_out = FALSE;
+                       return len;
+               }
+
+               /* If we already have a subthread then just signal it to start */
+               if (thisfd->bits.localsock.threadid) {
+                       pthread_mutex_lock(&thisfd->bits.localsock.mutex);
+                       thisfd->bits.localsock.state = PRE_COMMAND;
+                       pthread_cond_signal(&thisfd->bits.localsock.cond);
+                       pthread_mutex_unlock(&thisfd->bits.localsock.mutex);
+                       return len;
+               }
+
+               /* Create a pipe and add the reading end to our FD list */
+               pipe(comms_pipe);
+               newfd = malloc(sizeof(struct local_client));
+               if (!newfd) {
+                       struct clvm_header reply;
+                       close(comms_pipe[0]);
+                       close(comms_pipe[1]);
+
+                       reply.cmd = CLVMD_CMD_REPLY;
+                       reply.status = -ENOMEM;
+                       reply.arglen = 0;
+                       reply.flags = 0;
+                       send_message(&reply, sizeof(reply), our_csid,
+                                    thisfd->fd,
+                                    "Error sending ENOMEM reply to local user");
+                       return len;
+               }
+               DEBUGLOG("creating pipe, [%d, %d]\n", comms_pipe[0],
+                        comms_pipe[1]);
+               newfd->fd = comms_pipe[0];
+               newfd->type = THREAD_PIPE;
+               newfd->callback = local_pipe_callback;
+               newfd->next = thisfd->next;
+               newfd->bits.pipe.client = thisfd;
+               newfd->bits.pipe.threadid = 0;
+               thisfd->next = newfd;
+
+               /* Store a cross link to the pipe */
+               thisfd->bits.localsock.pipe_client = newfd;
+
+               thisfd->bits.localsock.pipe = comms_pipe[1];
+
+               /* Initialise and lock the mutex so the subthread will wait after
+                  finishing the PRE routine */
+               pthread_mutex_init(&thisfd->bits.localsock.mutex, NULL);
+               pthread_cond_init(&thisfd->bits.localsock.cond, NULL);
+               pthread_mutex_init(&thisfd->bits.localsock.reply_mutex, NULL);
+
+               /* Make sure the thread has a copy of it's own ID */
+               newfd->bits.pipe.threadid = thisfd->bits.localsock.threadid;
+
+               /* Run the pre routine */
+               thisfd->bits.localsock.in_progress = TRUE;
+               thisfd->bits.localsock.state = PRE_COMMAND;
+               pthread_create(&thisfd->bits.localsock.threadid, NULL,
+                              pre_and_post_thread, thisfd);
+       }
+       return len;
+}
+
+/* Register a file descriptor from the cluster or comms interface with
+   the main select() loop by pushing it onto the head of the global
+   client list. Always succeeds and returns 0. */
+int add_client(struct local_client *new_client)
+{
+	/* Head insertion: link the new entry first, then publish it */
+	new_client->next = local_client_head.next;
+	local_client_head.next = new_client;
+	return 0;
+}
+
+
+/*
+ * Forward a message that is too big to send inline: the full payload is
+ * written to the shared System LV and only a small header copy (flagged
+ * with CLVMD_FLAG_SYSTEMLV) is sent, telling receivers where to look.
+ * Returns errno on a System LV write failure, otherwise the result of
+ * send_message().
+ */
+static int send_long_message(struct local_client *thisfd, struct clvm_header *inheader, int len)
+{
+	struct clvm_header fwd_header;
+	int rv;
+
+	DEBUGLOG("Long message: being sent via system LV:\n");
+
+	/* Stage the whole message (header + args) in the System LV first */
+	rv = system_lv_write_data((char *) inheader, len);
+	if (rv < 0)
+		return errno;
+
+	/* Now send a header-only copy marked as a System LV transfer */
+	memcpy(&fwd_header, inheader, sizeof(fwd_header));
+	fwd_header.flags |= CLVMD_FLAG_SYSTEMLV;
+	fwd_header.xid = thisfd->xid;
+
+	return send_message(&fwd_header, sizeof(fwd_header), NULL, -1,
+			    "Error forwarding long message to cluster");
+}
+
+/* Called when the pre-command has completed successfully - we
+   now execute the real command on all the requested nodes.
+   The command is always queued to the local LVM thread first; unless
+   CLVMD_FLAG_LOCAL is set it is also forwarded to the cluster (via the
+   System LV when larger than MAX_INLINE_MESSAGE). */
+static int distribute_command(struct local_client *thisfd)
+{
+	struct clvm_header *inheader =
+	    (struct clvm_header *) thisfd->bits.localsock.cmd;
+	int len = thisfd->bits.localsock.cmd_len;
+
+	/* Stamp the command with a fresh transaction id so that late
+	   replies belonging to an earlier command can be discarded */
+	thisfd->xid = global_xid++;
+	DEBUGLOG("distribute command: XID = %d\n", thisfd->xid);
+
+	/* Forward it to other nodes in the cluster if needed */
+	if (!(inheader->flags & CLVMD_FLAG_LOCAL)) {
+		/* if node is empty then do it on the whole cluster */
+		if (inheader->node[0] == '\0') {
+			/* Expect one reply per cluster node */
+			thisfd->bits.localsock.expected_replies =
+			    get_num_nodes();
+			thisfd->bits.localsock.num_replies = 0;
+			thisfd->bits.localsock.sent_time = time(NULL);
+			thisfd->bits.localsock.in_progress = TRUE;
+			thisfd->bits.localsock.sent_out = TRUE;
+
+			/* Do it here first */
+			add_to_lvmqueue(thisfd, inheader, len, NULL);
+
+			DEBUGLOG("Sending message to all cluster nodes\n");
+			if (len > MAX_INLINE_MESSAGE) {
+				/* Too big for an inline cluster message:
+				   route the payload through the System LV */
+				send_long_message(thisfd, inheader, len );
+			} else {
+				inheader->xid = thisfd->xid;
+				send_message(inheader, len, NULL, -1,
+					     "Error forwarding message to cluster");
+			}
+		} else {
+			/* Do it on a single node */
+			char csid[MAX_CSID_LEN];
+
+			if (csid_from_name(csid, inheader->node)) {
+				/* This has already been checked so should not happen */
+				return 0;
+			} else {
+				/* OK, found a node... */
+				thisfd->bits.localsock.expected_replies = 1;
+				thisfd->bits.localsock.num_replies = 0;
+				thisfd->bits.localsock.in_progress = TRUE;
+
+				/* Are we the requested node ?? */
+				if (memcmp(csid, our_csid, MAX_CSID_LEN) == 0) {
+					DEBUGLOG("Doing command on local node only\n");
+					add_to_lvmqueue(thisfd, inheader, len, NULL);
+				} else {
+					DEBUGLOG("Sending message to single node: %s\n",
+						 inheader->node);
+					if (len > MAX_INLINE_MESSAGE) {
+						send_long_message(thisfd, inheader, len );
+					} else {
+						inheader->xid = thisfd->xid;
+						send_message(inheader, len,
+							     csid, -1,
+							     "Error forwarding message to cluster node");
+					}
+				}
+			}
+		}
+	} else {
+		/* Local explicitly requested, ignore nodes */
+		thisfd->bits.localsock.in_progress = TRUE;
+		thisfd->bits.localsock.expected_replies = 1;
+		thisfd->bits.localsock.num_replies = 0;
+		add_to_lvmqueue(thisfd, inheader, len, NULL);
+	}
+	return 0;
+}
+
+/* Process a command from a remote node and return the result.
+   Handles System-LV indirection for oversized messages, internal
+   GOAWAY/VERSION housekeeping commands, then runs the command via
+   do_command() and sends the reply back to the originating node. */
+void process_remote_command(struct clvm_header *msg, int msglen, int fd,
+			    char *csid)
+{
+	char *replyargs;
+	char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
+	int replylen = 0;
+	int buflen = MAX_CLUSTER_MESSAGE - sizeof(struct clvm_header) - 1;
+	int status;
+	int msg_malloced = 0;	/* set when "msg" is replaced by a heap copy below */
+
+	/* Get the node name as we /may/ need it later */
+	name_from_csid(csid, nodename);
+
+	DEBUGLOG("process_remote_command %d for clientid 0x%x on node %s\n",
+		 msg->cmd, msg->clientid, nodename);
+
+	/* Is the data to be found in the system LV ? */
+	if (msg->flags & CLVMD_FLAG_SYSTEMLV) {
+		struct clvm_header *newmsg;
+
+		DEBUGLOG("Reading message from system LV\n");
+		newmsg =
+		    (struct clvm_header *) malloc(msg->arglen +
+						  sizeof(struct clvm_header));
+		if (newmsg) {
+			/* Replace the inline stub with the full message
+			   read back from the System LV */
+			if (system_lv_read_data
+			    (nodename, (char *) newmsg,
+			     (size_t *) &msglen) == 0) {
+				msg = newmsg;
+				msg_malloced = 1;
+			} else {
+				struct clvm_header head;
+				DEBUGLOG("System LV read failed\n");
+
+				/* Return a failure response */
+				/* NOTE(review): head.xid is left uninitialised
+				   here - presumably it should carry msg->xid;
+				   confirm against the reply-matching code */
+				head.cmd = CLVMD_CMD_REPLY;
+				head.status = -EFBIG;
+				head.flags = 0;
+				head.clientid = msg->clientid;
+				head.arglen = 0;
+				head.node[0] = '\0';
+				send_message(&head, sizeof(struct clvm_header),
+					     csid, fd,
+					     "Error sending ENOMEM command reply");
+				return;
+			}
+		} else {
+			struct clvm_header head;
+			DEBUGLOG
+			    ("Error attempting to malloc %d bytes for system LV read\n",
+			     msg->arglen);
+			/* Return a failure response */
+			head.cmd = CLVMD_CMD_REPLY;
+			head.status = -ENOMEM;
+			head.flags = 0;
+			head.clientid = msg->clientid;
+			head.arglen = 0;
+			head.node[0] = '\0';
+			send_message(&head, sizeof(struct clvm_header), csid,
+				     fd, "Error sending ENOMEM command reply");
+			return;
+		}
+	}
+
+	/* Check for GOAWAY and sulk */
+	if (msg->cmd == CLVMD_CMD_GOAWAY) {
+
+		/* A peer rejected us (e.g. version mismatch): exit outright */
+		DEBUGLOG("Told to go away by %s\n", nodename);
+		log_error("Told to go away by %s\n", nodename);
+		exit(99);
+	}
+
+	/* Version check is internal - don't bother exposing it in
+	   clvmd-command.c */
+	if (msg->cmd == CLVMD_CMD_VERSION) {
+		int *version_nums = (int *) msg->args;
+		char node[256];
+		name_from_csid(csid, node);
+		DEBUGLOG("Remote node %s is version %d.%d.%d\n",
+			 node,
+			 ntohl(version_nums[0]),
+			 ntohl(version_nums[1]), ntohl(version_nums[2]));
+
+		/* Only the major version number must match */
+		if (ntohl(version_nums[0]) != CLVMD_MAJOR_VERSION) {
+			struct clvm_header byebyemsg;
+			DEBUGLOG
+			    ("Telling node %s to go away because of incompatible version number\n",
+			     node);
+			log_notice
+			    ("Telling node %s to go away because of incompatible version number %d.%d.%d\n",
+			     node, ntohl(version_nums[0]),
+			     ntohl(version_nums[1]), ntohl(version_nums[2]));
+
+			byebyemsg.cmd = CLVMD_CMD_GOAWAY;
+			byebyemsg.status = 0;
+			byebyemsg.flags = 0;
+			byebyemsg.arglen = 0;
+			byebyemsg.clientid = 0;
+			cluster_send_message(&byebyemsg, sizeof(byebyemsg),
+					     our_csid,
+					     "Error Sending GOAWAY message");
+		} else {
+			/* Compatible: mark the node as up and running clvmd */
+			add_up_node(csid);
+		}
+		return;
+	}
+
+	/* Allocate a default reply buffer */
+	replyargs = malloc(MAX_CLUSTER_MESSAGE - sizeof(struct clvm_header));
+
+	if (replyargs != NULL) {
+		/* Run the command */
+		status =
+		    do_command(NULL, msg, msglen, &replyargs, buflen,
+			       &replylen);
+	} else {
+		status = -ENOMEM;
+	}
+
+	/* If it wasn't a reply, then reply */
+	if (msg->cmd != CLVMD_CMD_REPLY) {
+		char *aggreply;
+
+		/* Grow the buffer so a header can be prepended; note that on
+		   success aggreply aliases the (possibly moved) replyargs */
+		aggreply =
+		    realloc(replyargs, replylen + sizeof(struct clvm_header));
+		if (aggreply) {
+			struct clvm_header *agghead =
+			    (struct clvm_header *) aggreply;
+
+			replyargs = aggreply;
+			/* Move it up so there's room for a header in front of the data */
+			memmove(aggreply + offsetof(struct clvm_header, args),
+				replyargs, replylen);
+
+			agghead->xid = msg->xid;
+
+			/* Use the system LV ? */
+			if (replylen > MAX_INLINE_MESSAGE) {
+				agghead->cmd = CLVMD_CMD_REPLY;
+				agghead->status = status;
+				agghead->flags = CLVMD_FLAG_SYSTEMLV;
+				agghead->clientid = msg->clientid;
+				agghead->arglen = replylen;
+				agghead->node[0] = '\0';
+
+				/* If System LV operation failed then report it as EFBIG but only do it
+				   if the data buffer has something in it. */
+				if (system_lv_write_data
+				    (aggreply,
+				     replylen + sizeof(struct clvm_header)) < 0
+				    && replylen > 0)
+					agghead->status = -EFBIG;
+
+				send_message(agghead,
+					     sizeof(struct clvm_header), csid,
+					     fd,
+					     "Error sending long command reply");
+
+			} else {
+				agghead->cmd = CLVMD_CMD_REPLY;
+				agghead->status = status;
+				agghead->flags = 0;
+				agghead->clientid = msg->clientid;
+				agghead->arglen = replylen;
+				agghead->node[0] = '\0';
+				/* NOTE(review): the "+ 2" here sends two bytes
+				   beyond the realloc'd size (header + replylen)
+				   - looks like an overread; if two trailing NUL
+				   bytes are intended they should be included in
+				   the allocation above. Confirm before changing. */
+				send_message(aggreply,
+					     sizeof(struct clvm_header) +
+					     replylen + 2, csid, fd,
+					     "Error sending command reply");
+			}
+		} else {
+			struct clvm_header head;
+
+			DEBUGLOG("Error attempting to realloc return buffer\n");
+			/* Return a failure response */
+			head.cmd = CLVMD_CMD_REPLY;
+			head.status = -ENOMEM;
+			head.flags = 0;
+			head.clientid = msg->clientid;
+			head.arglen = 0;
+			head.node[0] = '\0';
+			send_message(&head, sizeof(struct clvm_header), csid,
+				     fd, "Error sending ENOMEM command reply");
+			/* NOTE(review): replyargs is leaked on this path -
+			   realloc failure leaves the original block allocated */
+			return;
+		}
+	}
+
+	/* Free buffer if it was malloced */
+	if (msg_malloced) {
+		free(msg);
+	}
+	free(replyargs);
+}
+
+/* Add a reply to a command to the list of replies for this client.
+   If we have got a full set then send them to the waiting client down the local
+   socket */
+static void add_reply_to_list(struct local_client *client, int status,
+			      char *csid, const char *buf, int len)
+{
+	struct node_reply *reply;
+
+	/* The reply chain is shared with the pre/post thread, so hold
+	   reply_mutex while we modify it or examine the counters */
+	pthread_mutex_lock(&client->bits.localsock.reply_mutex);
+
+	/* Add it to the list of replies */
+	reply = malloc(sizeof(struct node_reply));
+	if (reply) {
+		reply->status = status;
+		name_from_csid(csid, reply->node);
+		DEBUGLOG("Reply from node %s: %d bytes\n", reply->node, len);
+
+		if (len > 0) {
+			reply->replymsg = malloc(len);
+			if (!reply->replymsg) {
+				/* Keep the entry (so the reply count stays
+				   right) but mark it as failed */
+				reply->status = -ENOMEM;
+			} else {
+				memcpy(reply->replymsg, buf, len);
+			}
+		} else {
+			reply->replymsg = NULL;
+		}
+		/* Hook it onto the reply chain */
+		reply->next = client->bits.localsock.replies;
+		client->bits.localsock.replies = reply;
+	} else {
+		/* It's all gone horribly wrong... send what we have back
+		   to the client immediately (drop the lock first, as
+		   send_local_reply walks the reply chain itself) */
+		pthread_mutex_unlock(&client->bits.localsock.reply_mutex);
+		send_local_reply(client, ENOMEM, client->fd);
+		return;
+	}
+	DEBUGLOG("Got %d replies, expecting: %d\n",
+		 client->bits.localsock.num_replies + 1,
+		 client->bits.localsock.expected_replies);
+
+	/* If we have the whole lot then do the post-process */
+	if (++client->bits.localsock.num_replies ==
+	    client->bits.localsock.expected_replies) {
+		/* Post-process the command */
+		if (client->bits.localsock.threadid) {
+			/* Wake the pre/post thread to run the POST routine */
+			pthread_mutex_lock(&client->bits.localsock.mutex);
+			client->bits.localsock.state = POST_COMMAND;
+			pthread_cond_signal(&client->bits.localsock.cond);
+			pthread_mutex_unlock(&client->bits.localsock.mutex);
+		}
+	}
+	pthread_mutex_unlock(&client->bits.localsock.reply_mutex);
+}
+
+/* This is the thread that runs the PRE and post commands for a particular
+   connection. It alternates between do_pre_command() and do_post_command(),
+   reporting each status to the parent down the client's pipe, and sleeps
+   on the client's condition variable between phases until the main thread
+   signals a state change (PRE_COMMAND / POST_COMMAND). */
+static void *pre_and_post_thread(void *arg)
+{
+	struct local_client *client = (struct local_client *) arg;
+	int status;
+	sigset_t ss;
+	int pipe_fd = client->bits.localsock.pipe;
+
+	DEBUGLOG("in sub thread: client = %p\n", client);
+
+	/* Ignore SIGUSR1 (handled by master process) but enable
+	   SIGUSR2 (kills subthreads) */
+	sigemptyset(&ss);
+	sigaddset(&ss, SIGUSR1);
+	pthread_sigmask(SIG_BLOCK, &ss, NULL);
+
+	sigdelset(&ss, SIGUSR1);
+	sigaddset(&ss, SIGUSR2);
+	pthread_sigmask(SIG_UNBLOCK, &ss, NULL);
+
+	/* Loop around doing PRE and POST functions until the client goes away */
+	while (!client->bits.localsock.finished) {
+		/* Execute the code */
+		status = do_pre_command(client);
+
+		if (status)
+			client->bits.localsock.all_success = 0;
+
+		DEBUGLOG("Writing status %d down pipe %d\n", status, pipe_fd);
+		/* Tell the parent process we have finished this bit */
+		write(pipe_fd, &status, sizeof(int));
+
+		/* We may need to wait for the condition variable before running
+		   the post command. Re-test the state in a loop: POSIX allows
+		   pthread_cond_wait() to wake spuriously, so only the state
+		   field says we may actually proceed. */
+		pthread_mutex_lock(&client->bits.localsock.mutex);
+		DEBUGLOG("Waiting to do post command - state = %d\n",
+			 client->bits.localsock.state);
+
+		while (client->bits.localsock.state != POST_COMMAND) {
+			pthread_cond_wait(&client->bits.localsock.cond,
+					  &client->bits.localsock.mutex);
+		}
+		pthread_mutex_unlock(&client->bits.localsock.mutex);
+
+		DEBUGLOG("Got post command condition...\n");
+
+		do_post_command(client);
+
+		write(pipe_fd, &status, sizeof(int));
+
+		if (client->bits.localsock.finished)
+			break;
+
+		DEBUGLOG("Waiting for next pre command\n");
+
+		/* Same spurious-wakeup-safe wait for the next PRE phase */
+		pthread_mutex_lock(&client->bits.localsock.mutex);
+		while (client->bits.localsock.state != PRE_COMMAND) {
+			pthread_cond_wait(&client->bits.localsock.cond,
+					  &client->bits.localsock.mutex);
+		}
+		pthread_mutex_unlock(&client->bits.localsock.mutex);
+
+		DEBUGLOG("Got pre command condition...\n");
+	}
+	DEBUGLOG("Subthread finished\n");
+	return (void *) 0;
+}
+
+/* Process a command on the local node and store the result on the
+   client's reply list (unless the transaction id has gone stale).
+   Returns the command status, or -1 if the reply buffer could not be
+   allocated. */
+static int process_local_command(struct clvm_header *msg, int msglen,
+				 struct local_client *client,
+				 unsigned short xid)
+{
+	int status;
+	int replylen = 0;
+	int buflen = MAX_CLUSTER_MESSAGE - sizeof(struct clvm_header) - 1;
+	char *replybuf = malloc(MAX_CLUSTER_MESSAGE);
+
+	DEBUGLOG("process_local_command: msg=%p, msglen =%d, client=%p\n", msg,
+		 msglen, client);
+	if (replybuf == NULL)
+		return -1;
+
+	/* Hand the work to the command processor */
+	status = do_command(client, msg, msglen, &replybuf, buflen, &replylen);
+
+	if (status)
+		client->bits.localsock.all_success = 0;
+
+	/* Only keep the reply if the client is still waiting on this
+	   transaction; a mismatched xid means the caller has moved on */
+	if (xid == client->xid)
+		add_reply_to_list(client, status, our_csid, replybuf, replylen);
+	else
+		DEBUGLOG
+		    ("Local command took too long, discarding xid %d, current is %d\n",
+		     xid, client->xid);
+
+	free(replybuf);
+	return status;
+}
+
+/* Handle a reply that has come back from another node: attach it to the
+   owning client's reply list, discarding it if the transaction id is
+   stale. Returns -1 if the client id is unknown, 0 otherwise. */
+static int process_reply(struct clvm_header *msg, int msglen, char *csid)
+{
+	struct local_client *client = find_client(msg->clientid);
+
+	if (!client) {
+		DEBUGLOG("Got message for unknown client 0x%x\n",
+			 msg->clientid);
+		log_error("Got message for unknown client 0x%x\n",
+			  msg->clientid);
+		return -1;
+	}
+
+	/* Any failing node poisons the aggregate success flag */
+	if (msg->status)
+		client->bits.localsock.all_success = 0;
+
+	/* Gather replies together for this client id - but only those
+	   belonging to the current transaction */
+	if (msg->xid != client->xid) {
+		DEBUGLOG("Discarding reply with old XID %d, current = %d\n",
+			 msg->xid, client->xid);
+		return 0;
+	}
+
+	add_reply_to_list(client, msg->status, csid, msg->args, msg->arglen);
+	return 0;
+}
+
+/* Send an aggregated reply back to the client.
+   Serialises the collected per-node replies as
+   [node-name\0][int status][reply-text\0]... terminated by an empty node
+   name, sends the lot down the local socket, then frees the reply chain
+   and resets the client's command state. */
+static void send_local_reply(struct local_client *client, int status, int fd)
+{
+	struct clvm_header *clientreply;
+	struct node_reply *thisreply = client->bits.localsock.replies;
+	char *replybuf;
+	char *ptr;
+	int message_len = 0;
+
+	DEBUGLOG("Send local reply\n");
+
+	/* Work out the total size of the reply */
+	while (thisreply) {
+		if (thisreply->replymsg)
+			message_len += strlen(thisreply->replymsg) + 1;
+		else
+			message_len++;
+
+		message_len += strlen(thisreply->node) + 1 + sizeof(int);
+
+		thisreply = thisreply->next;
+	}
+
+	/* Add in the size of our header */
+	message_len = message_len + sizeof(struct clvm_header) + 1;
+	replybuf = malloc(message_len);
+	if (!replybuf) {
+		/* Can't build the reply: free the collected replies and reset
+		   state so the connection stays consistent (previously this
+		   fell through and dereferenced a NULL buffer) */
+		log_error("Memory allocation failure for local reply\n");
+		thisreply = client->bits.localsock.replies;
+		while (thisreply) {
+			struct node_reply *tempreply = thisreply;
+
+			thisreply = thisreply->next;
+			if (tempreply->replymsg)
+				free(tempreply->replymsg);
+			free(tempreply);
+		}
+		client->bits.localsock.replies = NULL;
+		client->bits.localsock.expected_replies = 0;
+		client->bits.localsock.in_progress = FALSE;
+		client->bits.localsock.sent_out = FALSE;
+		return;
+	}
+
+	clientreply = (struct clvm_header *) replybuf;
+	clientreply->status = -status;
+	clientreply->cmd = CLVMD_CMD_REPLY;
+	clientreply->node[0] = '\0';
+
+	ptr = clientreply->args;
+
+	/* Add in all the replies, and free them as we go */
+	thisreply = client->bits.localsock.replies;
+	while (thisreply) {
+		struct node_reply *tempreply = thisreply;
+
+		strcpy(ptr, thisreply->node);
+		ptr += strlen(thisreply->node) + 1;
+
+		*(int *) ptr = thisreply->status;
+		ptr += sizeof(int);
+
+		if (thisreply->replymsg) {
+			strcpy(ptr, thisreply->replymsg);
+			ptr += strlen(thisreply->replymsg) + 1;
+		} else {
+			ptr[0] = '\0';
+			ptr++;
+		}
+		thisreply = thisreply->next;
+
+		if (tempreply->replymsg)
+			free(tempreply->replymsg);
+		free(tempreply);
+	}
+
+	/* Terminate with an empty node name */
+	*ptr = '\0';
+
+	clientreply->arglen = ptr - clientreply->args + 1;
+
+	/* And send it */
+	send_message(replybuf, message_len, our_csid, fd,
+		     "Error sending REPLY to client");
+	free(replybuf);
+
+	/* Reset comms variables */
+	client->bits.localsock.replies = NULL;
+	client->bits.localsock.expected_replies = 0;
+	client->bits.localsock.in_progress = FALSE;
+	client->bits.localsock.sent_out = FALSE;
+}
+
+/* Just free a reply chain because it wasn't used: release each entry
+   and its payload, then clear the list head. */
+static void free_reply(struct local_client *client)
+{
+	struct node_reply *reply = client->bits.localsock.replies;
+
+	while (reply) {
+		struct node_reply *next = reply->next;
+
+		if (reply->replymsg)
+			free(reply->replymsg);
+		free(reply);
+		reply = next;
+	}
+	client->bits.localsock.replies = NULL;
+}
+
+/* Send our version number to the cluster so peers can check
+   compatibility (see the CLVMD_CMD_VERSION handling in
+   process_remote_command). */
+static void send_version_message()
+{
+	char message[sizeof(struct clvm_header) + sizeof(int) * 3];
+	struct clvm_header *msg = (struct clvm_header *) message;
+	int *version_nums = (int *) msg->args;
+
+	/* Zero the whole buffer so header fields we don't set explicitly
+	   (xid, node) don't leak uninitialised stack bytes onto the wire */
+	memset(message, 0, sizeof(message));
+
+	msg->cmd = CLVMD_CMD_VERSION;
+	msg->status = 0;
+	msg->flags = 0;
+	msg->clientid = 0;
+	msg->arglen = sizeof(int) * 3;
+
+	/* The version triple travels in network byte order */
+	version_nums[0] = htonl(CLVMD_MAJOR_VERSION);
+	version_nums[1] = htonl(CLVMD_MINOR_VERSION);
+	version_nums[2] = htonl(CLVMD_PATCH_VERSION);
+
+	cluster_send_message(message, sizeof(message), NULL,
+			     "Error Sending version number");
+}
+
+/* Send a message to either a local client or another server.
+   A NULL or non-local csid routes the (byte-swapped) message over the
+   cluster interconnect; otherwise it is written to the local fd, looping
+   to cope with short writes. Returns the cluster-send result, or the
+   size of the last successful write() chunk (<= 0 on write error). */
+static int send_message(void *buf, int msglen, char *csid, int fd,
+			const char *errtext)
+{
+	int len = 0;
+
+	/* Send remote messages down the cluster socket */
+	if (csid == NULL || !ISLOCAL_CSID(csid)) {
+		hton_clvm((struct clvm_header *) buf);	/* Byte swap if necessary */
+		return cluster_send_message(buf, msglen, csid, errtext);
+	} else {
+		int ptr = 0;
+
+		/* Make sure it all goes: loop until the TOTAL written
+		   reaches msglen. (Comparing the last chunk size against
+		   msglen, as before, made any short write spin into a
+		   zero-length write and a spurious error.) */
+		do {
+			len = write(fd, buf + ptr, msglen - ptr);
+
+			if (len <= 0) {
+				log_error(errtext);
+				break;
+			}
+			ptr += len;
+		} while (ptr < msglen);
+	}
+	return len;
+}
+
+/* Execute one queued work item on the LVM thread, dispatching on its
+   origin: remote commands arrived over the cluster interconnect, local
+   ones came down the client socket. Always returns 0. */
+static int process_work_item(struct lvm_thread_cmd *cmd)
+{
+	if (cmd->remote) {
+		DEBUGLOG("process_work_item: remote\n");
+		process_remote_command(cmd->msg, cmd->msglen, cmd->client->fd,
+				       cmd->csid);
+	} else {
+		DEBUGLOG("process_work_item: local\n");
+		process_local_command(cmd->msg, cmd->msglen, cmd->client,
+				      cmd->xid);
+	}
+	return 0;
+}
+
+/*
+ * Routine that runs in the "LVM thread".
+ * Initialises liblvm, then loops forever draining the lvm_cmd_head work
+ * queue fed by add_to_lvmqueue(); never returns.
+ */
+static void *lvm_thread_fn(void *arg)
+{
+	struct list *cmdl, *tmp;
+	sigset_t ss;
+
+	DEBUGLOG("LVM thread function started\n");
+	pthread_mutex_lock(&lvm_thread_mutex);
+
+	/* Ignore SIGUSR1 & 2 */
+	sigemptyset(&ss);
+	sigaddset(&ss, SIGUSR1);
+	sigaddset(&ss, SIGUSR2);
+	pthread_sigmask(SIG_BLOCK, &ss, NULL);
+
+	/* Initialise the interface to liblvm (under the mutex so no work
+	   item can run before initialisation completes) */
+	init_lvm();
+	pthread_mutex_unlock(&lvm_thread_mutex);
+
+	/* Now wait for some actual work */
+	for (;;) {
+		DEBUGLOG("LVM thread waiting for work\n");
+
+		pthread_mutex_lock(&lvm_thread_mutex);
+		/* "while", not "if": pthread_cond_wait() may wake up
+		   spuriously, so re-check the queue before proceeding */
+		while (list_empty(&lvm_cmd_head))
+			pthread_cond_wait(&lvm_thread_cond, &lvm_thread_mutex);
+
+		list_iterate_safe(cmdl, tmp, &lvm_cmd_head) {
+			struct lvm_thread_cmd *cmd;
+
+			cmd =
+			    list_struct_base(cmdl, struct lvm_thread_cmd, list);
+			list_del(&cmd->list);
+			/* Drop the mutex while running the (possibly slow)
+			   command so producers aren't blocked */
+			pthread_mutex_unlock(&lvm_thread_mutex);
+
+			process_work_item(cmd);
+			free(cmd->msg);
+			free(cmd);
+
+			pthread_mutex_lock(&lvm_thread_mutex);
+		}
+		pthread_mutex_unlock(&lvm_thread_mutex);
+	}
+}
+
+/* Pass down some work to the LVM thread: copy the message into a new
+   work item, queue it on lvm_cmd_head and wake lvm_thread_fn().
+   A non-NULL csid marks the command as having come from a remote node.
+   Returns 0 on success, -ENOMEM/-1 on allocation failure. */
+static int add_to_lvmqueue(struct local_client *client, struct clvm_header *msg,
+			   int msglen, char *csid)
+{
+	struct lvm_thread_cmd *cmd;
+
+	if (!(cmd = malloc(sizeof(struct lvm_thread_cmd))))
+		return -ENOMEM;
+
+	if (!(cmd->msg = malloc(msglen))) {
+		log_error("Unable to allocate buffer space\n");
+		free(cmd);
+		return -1;
+	}
+
+	memcpy(cmd->msg, msg, msglen);
+	cmd->client = client;
+	cmd->msglen = msglen;
+	cmd->xid = client->xid;
+
+	cmd->remote = csid ? 1 : 0;
+	if (csid)
+		memcpy(cmd->csid, csid, MAX_CSID_LEN);
+
+	DEBUGLOG
+	    ("add_to_lvmqueue: cmd=%p. client=%p, msg=%p, len=%d, csid=%p, xid=%d\n",
+	     cmd, client, msg, msglen, csid, cmd->xid);
+
+	/* Queue it and wake the LVM thread, all under its mutex */
+	pthread_mutex_lock(&lvm_thread_mutex);
+	list_add(&lvm_cmd_head, &cmd->list);
+	pthread_cond_signal(&lvm_thread_cond);
+	pthread_mutex_unlock(&lvm_thread_mutex);
+
+	return 0;
+}
+
+/* Open the local socket, that's the one we talk to libclvm down */
+static int open_local_sock()
+{
+       int local_socket;
+       struct sockaddr_un sockaddr;
+
+       /* Open local socket */
+       if (CLVMD_SOCKNAME[0] != '\0')
+               unlink(CLVMD_SOCKNAME);
+       local_socket = socket(PF_UNIX, SOCK_STREAM, 0);
+       if (local_socket < 0) {
+               log_error("Can't create local socket: %m");
+               return -1;
+       }
+
+       memset(&sockaddr, 0, sizeof(sockaddr));
+       memcpy(sockaddr.sun_path, CLVMD_SOCKNAME, sizeof(CLVMD_SOCKNAME));
+       sockaddr.sun_family = AF_UNIX;
+       if (bind(local_socket, (struct sockaddr *) &sockaddr, sizeof(sockaddr))) {
+               log_error("can't bind local socket: %m");
+               close(local_socket);
+               return -1;
+       }
+       if (listen(local_socket, 1) != 0) {
+               log_error("listen local: %m");
+               close(local_socket);
+               return -1;
+       }
+       if (CLVMD_SOCKNAME[0] != '\0')
+               chmod(CLVMD_SOCKNAME, 0600);
+
+       return local_socket;
+}
+
/* Cluster-layer callback used by check_all_clvmds_running():
   queue an error reply for every node that is not running clvmd. */
static void check_all_callback(struct local_client *client, char *csid,
			       int node_up)
{
	/* sizeof includes the terminating NUL (18 bytes), replacing the
	   old hand-counted magic number. */
	if (!node_up)
		add_reply_to_list(client, -EHOSTDOWN, csid, "CLVMD not running",
				  sizeof("CLVMD not running"));
}
+
/* Check to see if all CLVMDs are running (ie one on
   every node in the cluster).
   If not, returns -1 and prints out a list of errant nodes */
static int check_all_clvmds_running(struct local_client *client)
{
	/* The cluster layer iterates the membership; check_all_callback()
	   queues an -EHOSTDOWN reply for each node that is down. */
	DEBUGLOG("check_all_clvmds_running\n");
	return cluster_do_node_callback(client, check_all_callback);
}
+
+/* Return a local_client struct given a client ID.
+   client IDs are in network byte order */
+static struct local_client *find_client(int clientid)
+{
+       struct local_client *thisfd;
+       for (thisfd = &local_client_head; thisfd != NULL; thisfd = thisfd->next) {
+               if (thisfd->fd == ntohl(clientid))
+                       return thisfd;
+       }
+       return NULL;
+}
+
/* Byte-swapping routines for the header so we
   work in a heterogeneous environment */
static void hton_clvm(struct clvm_header *hdr)
{
	/* Convert the multi-byte header fields to network byte order
	   before the message leaves this node. */
	hdr->status = htonl(hdr->status);
	hdr->arglen = htonl(hdr->arglen);
	hdr->xid = htons(hdr->xid);
	/* Don't swap clientid as it's only a token as far as
	   remote nodes are concerned */
}
+
/* Inverse of hton_clvm(): convert a received header back to host
   byte order.  clientid is deliberately left untouched, as above. */
static void ntoh_clvm(struct clvm_header *hdr)
{
	hdr->status = ntohl(hdr->status);
	hdr->arglen = ntohl(hdr->arglen);
	hdr->xid = ntohs(hdr->xid);
}
+
/* Handler for SIGUSR2 - sent to kill subthreads */
static void sigusr2_handler(int sig)
{
	DEBUGLOG("SIGUSR2 received\n");
	/* NOTE(review): neither pthread_exit() nor the fprintf inside
	   DEBUGLOG is async-signal-safe; this relies on SIGUSR2 only ever
	   being delivered to worker threads at controlled points - confirm
	   at the sites that raise it. */
	pthread_exit((void *) -1);
	return;		/* not reached - pthread_exit does not return */
}
+
/* Handler for SIGTERM: just flag the main loop to exit; the real
   cleanup happens outside signal context. */
static void sigterm_handler(int sig)
{
	DEBUGLOG("SIGTERM received\n");
	/* NOTE(review): "quit" should be declared volatile sig_atomic_t -
	   confirm at its definition.  DEBUGLOG (fprintf) is not
	   async-signal-safe. */
	quit = 1;
	return;
}
diff --git a/daemons/clvmd/clvmd.h b/daemons/clvmd/clvmd.h
new file mode 100644 (file)
index 0000000..46e53c4
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _CLVMD_H
+#define _CLVMD_H
+
+#define CLVMD_MAJOR_VERSION 0
+#define CLVMD_MINOR_VERSION 2
+#define CLVMD_PATCH_VERSION 1
+
+/* Name of the cluster LVM admin lock */
+#define ADMIN_LOCK_NAME "CLVMD_ADMIN"
+
+/* Default time (in seconds) we will wait for all remote commands to execute
+   before declaring them dead */
+#define DEFAULT_CMD_TIMEOUT 60
+
/* One of these for each reply we get from command execution on a node */
struct node_reply {
	char node[MAX_CLUSTER_MEMBER_NAME_LEN];	/* Name of the replying node */
	char *replymsg;		/* Reply text (heap-allocated) */
	int status;		/* Command status from that node */
	struct node_reply *next;	/* Singly-linked list of replies */
};

/*
 * These exist for the use of local sockets only when we are
 * collecting responses from all cluster nodes
 */
struct localsock_bits {
	struct node_reply *replies;	/* List of node_reply collected so far */
	int num_replies;
	int expected_replies;
	time_t sent_time;       /* So we can check for timeouts */
	int in_progress;        /* Only execute one cmd at a time per client */
	int sent_out;           /* Flag to indicate that a command was sent
                                  to remote nodes */
	void *private;          /* Private area for command processor use */
	void *cmd;              /* Whole command as passed down local socket */
	int cmd_len;            /* Length of above */
	int pipe;               /* Pipe to send PRE completion status down */
	int finished;           /* Flag to tell subthread to exit */
	int all_success;        /* Set to 0 if any node (or the pre_command)
                                  failed */
	struct local_client *pipe_client;	/* Client entry for "pipe" */
	pthread_t threadid;	/* Subthread executing this command */
	enum { PRE_COMMAND, POST_COMMAND, QUIT } state;	/* Subthread state machine */
	pthread_mutex_t mutex;  /* Main thread and worker synchronisation */
	pthread_cond_t cond;

	pthread_mutex_t reply_mutex;    /* Protect reply structure */
};

/* Entries for PIPE clients */
struct pipe_bits {
	struct local_client *client;    /* Actual (localsock) client */
	pthread_t threadid;             /* Our own copy of the thread id */
};

/* Entries for Network socket clients */
struct netsock_bits {
	void *private;		/* Private area for the cluster interface */
	int flags;
};

/* Called when data arrives on a client's fd; may create a new client
   (e.g. accepting a connection on a rendezvous socket). */
typedef int (*fd_callback_t) (struct local_client * fd, char *buf, int len,
			      char *csid, struct local_client ** new_client);

/* One of these for each fd we are listening on */
struct local_client {
	int fd;			/* The fd itself; doubles as the client id */
	enum { CLUSTER_MAIN_SOCK, CLUSTER_DATA_SOCK, LOCAL_RENDEZVOUS,
		    LOCAL_SOCK, THREAD_PIPE, CLUSTER_INTERNAL } type;
	struct local_client *next;	/* Singly-linked client list */
	unsigned short xid;	/* Transaction id of the current command */
	fd_callback_t callback;	/* Invoked when this fd becomes readable */

	/* Type-specific state - which member is valid depends on "type" */
	union {
		struct localsock_bits localsock;
		struct pipe_bits pipe;
		struct netsock_bits net;
	} bits;
};
+
#ifdef DEBUG
/* Wrapped in do { } while (0) so DEBUGLOG() behaves as a single statement:
   the old definition expanded to TWO statements, so inside an unbraced
   "if" only the first fprintf was conditional. */
#define DEBUGLOG(fmt, args...) do { fprintf(stderr, "CLVMD[%d]: %ld ", getpid(), time(NULL) ); fprintf(stderr, fmt, ## args); } while (0)
#else
#define DEBUGLOG(fmt, args...) do { } while (0)
#endif
+
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+/* The real command processor is in clvmd-command.c */
+extern int do_command(struct local_client *client, struct clvm_header *msg,
+                     int msglen, char **buf, int buflen, int *retlen);
+
+/* Pre and post command routines are called only on the local node */
+extern int do_pre_command(struct local_client *client);
+extern int do_post_command(struct local_client *client);
+
+extern int add_client(struct local_client *new_client);
+
+extern void clvmd_cluster_init_completed(void);
+
+#endif
diff --git a/daemons/clvmd/cnxman-socket.h b/daemons/clvmd/cnxman-socket.h
new file mode 100644 (file)
index 0000000..8ae44d8
--- /dev/null
@@ -0,0 +1,226 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* CMAN socket interface header,
+   may be include by user or kernel code */
+
+#ifndef __CNXMAN_SOCKET_H
+#define __CNXMAN_SOCKET_H
+
+/* Just made these up but the address family must be less than 32 (NPROTO) */
+#define AF_CLUSTER 31
+#define PF_CLUSTER AF_CLUSTER
+
+/* Protocol(socket) types */
+#define CLPROTO_MASTER 2
+#define CLPROTO_CLIENT 3
+
+/* Setsockopt -- maybe should be ioctls?? */
+#define CLU_SET_MULTICAST  100
+#define CLU_JOIN_CLUSTER   101
+#define CLU_LEAVE_CLUSTER  102
+#define CLU_SET_RCVONLY    103
+#define CLU_SET_UNICAST    104
+#define KCL_SET_MULTICAST  105
+#define KCL_SET_RCVONLY    106
+#define KCL_SET_UNICAST    107
+#define KCL_SET_NODENAME   108
+#define CLU_SET_NODENAME   109
+
+/* ioctls -- should register these properly */
+#define SIOCCLUSTER_NOTIFY            _IOW('x', 0x01, int)
+#define SIOCCLUSTER_REMOVENOTIFY      _IO( 'x', 0x02)
+#define SIOCCLUSTER_GETMEMBERS        _IOR('x', 0x03, struct cl_cluster_nodelist)
+#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
+#define SIOCCLUSTER_ISQUORATE         _IO( 'x', 0x05)
+#define SIOCCLUSTER_ISLISTENING       _IOW('x', 0x06, struct cl_listen_request)
+#define SIOCCLUSTER_GETALLMEMBERS     _IOR('x', 0x07, struct cl_cluster_nodelist)
+#define SIOCCLUSTER_SET_VOTES         _IOW('x', 0x08, int)
+#define SIOCCLUSTER_GET_VERSION       _IOR('x', 0x09, struct cl_version)
+#define SIOCCLUSTER_SET_VERSION       _IOW('x', 0x0a, struct cl_version)
+#define SIOCCLUSTER_ISACTIVE          _IO( 'x', 0x0b)
+#define SIOCCLUSTER_KILLNODE          _IOW('x', 0x0c, int)
+#define SIOCCLUSTER_GET_JOINCOUNT     _IO( 'x', 0x0d)
+#define SIOCCLUSTER_SERVICE_REGISTER  _IOW('x', 0x0e, char)
+#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
+#define SIOCCLUSTER_SERVICE_JOIN      _IO( 'x', 0x10)
+#define SIOCCLUSTER_SERVICE_LEAVE     _IO( 'x', 0x20)
+#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
+#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
+#define SIOCCLUSTER_SERVICE_GETEVENT  _IOR('x', 0x50, struct cl_service_event)
+#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_node)
+#define SIOCCLUSTER_SERVICE_GLOBALID  _IOR('x', 0x70, uint32_t)
+#define SIOCCLUSTER_SERVICE_SETLEVEL  _IOR('x', 0x80, int)
+#define SIOCCLUSTER_GETNODE          _IOWR('x', 0x90, struct cl_cluster_node)
+#define SIOCCLUSTER_BARRIER           _IOW('x', 0x0a0, struct cl_barrier_info)
+
+/* Maximum size of a cluster message */
+#define MAX_CLUSTER_MESSAGE          1500
+#define MAX_CLUSTER_MEMBER_NAME_LEN   255
+#define MAX_BARRIER_NAME_LEN           33
+#define MAX_SA_ADDR_LEN                12
+#define MAX_CLUSTER_NAME_LEN           16
+
+/* Well-known cluster port numbers */
+#define CLUSTER_PORT_MEMBERSHIP  1     /* Mustn't block during cluster
+                                        * transitions! */
+#define CLUSTER_PORT_SERVICES    2
+#define CLUSTER_PORT_SYSMAN      10    /* Remote execution daemon */
+#define CLUSTER_PORT_CLVMD       11    /* Cluster LVM daemon */
+#define CLUSTER_PORT_SLM         12    /* LVM SLM (simple lock manager) */
+
+/* Port numbers above this will be blocked when the cluster is inquorate or in
+ * transition */
+#define HIGH_PROTECTED_PORT      9
+
+/* Reasons for leaving the cluster */
+#define CLUSTER_LEAVEFLAG_DOWN     0   /* Normal shutdown */
+#define CLUSTER_LEAVEFLAG_KILLED   1
+#define CLUSTER_LEAVEFLAG_PANIC    2
+#define CLUSTER_LEAVEFLAG_REMOVED  3   /* This one can reduce quorum */
+#define CLUSTER_LEAVEFLAG_REJECTED 4   /* Not allowed into the cluster in the
+                                        * first place */
+#define CLUSTER_LEAVEFLAG_INCONSISTENT 5       /* Our view of the cluster is
+                                                * in a minority */
+#define CLUSTER_LEAVEFLAG_DEAD         6       /* Discovered to be dead */
+#define CLUSTER_LEAVEFLAG_FORCE     0x10       /* Forced by command-line */
+
+/* OOB messages sent to a local socket */
+#define CLUSTER_OOB_MSG_PORTCLOSED  1
+#define CLUSTER_OOB_MSG_STATECHANGE 2
+#define CLUSTER_OOB_MSG_SERVICEEVENT 3
+
+/* Sendmsg flags, these are above the normal sendmsg flags so they don't
+ * interfere */
+#define MSG_NOACK     0x010000 /* Don't need an ACK for this message */
+#define MSG_QUEUE     0x020000 /* Queue the message for sending later */
+#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster
+                                */
+#define MSG_ALLINT    0x100000 /* Send out of all interfaces */
+
+typedef enum { NODESTATE_REMOTEMEMBER, NODESTATE_JOINING, NODESTATE_MEMBER,
+           NODESTATE_DEAD } nodestate_t;
+
+
/* Address of an AF_CLUSTER socket: family first (as in struct sockaddr),
   then the protocol-specific addressing - cluster port and node id. */
struct sockaddr_cl {
	unsigned short scl_family;	/* AF_CLUSTER */
	unsigned char scl_flags;
	unsigned char scl_port;	/* cluster port (see CLUSTER_PORT_* above) */
	int           scl_nodeid;
};
+
+/* This is how we pass the multicast socket into kernel space. addr is the
+ * multicast address to use in the address family of the socket (eg for UDP it
+ * might be 255.255.255.0) */
+struct cl_multicast_sock {
+       int fd;                 /* FD of master socket to do multicast on */
+       int number;             /* Socket number, to match up recvonly & bcast
+                                * sockets */
+};
+
+/* Cluster configuration info passed when we join the cluster */
+struct cl_join_cluster_info {
+       unsigned char votes;
+       unsigned int expected_votes;
+       unsigned int two_node;
+       unsigned int config_version;
+
+        char cluster_name[17];
+};
+
+
+/* This is the structure, per node, returned from the membership ioctl */
+struct cl_cluster_node {
+       unsigned int size;
+       unsigned int node_id;
+       unsigned int us;
+       unsigned int leave_reason;
+       unsigned int incarnation;
+       nodestate_t state;
+       char name[MAX_CLUSTER_MEMBER_NAME_LEN];
+       unsigned char votes;
+};
+
+/* The struct passed to the membership ioctls */
+struct cl_cluster_nodelist {
+        uint32_t max_members;
+        struct cl_cluster_node *nodes;
+};
+
+/* Structure passed to SIOCCLUSTER_ISLISTENING */
+struct cl_listen_request {
+       unsigned char port;
+        int           nodeid;
+};
+
+/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
+struct cl_portclosed_oob {
+       unsigned char cmd;      /* CLUSTER_OOB_MSG_PORTCLOSED */
+       unsigned char port;
+};
+
+/* Get all version numbers or set the config version */
+struct cl_version {
+       unsigned int major;
+       unsigned int minor;
+       unsigned int patch;
+       unsigned int config;
+};
+
+/* structure passed to barrier ioctls */
+struct cl_barrier_info {
+       char cmd;
+       char name[MAX_BARRIER_NAME_LEN];
+       unsigned int flags;
+       unsigned long arg;
+};
+
+typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
+               SERVICE_EVENT_LEAVEDONE } service_event_t;
+
+typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
+               service_start_t;
+
+struct cl_service_event {
+       service_event_t type;
+       service_start_t start_type;
+       unsigned int event_id;
+       unsigned int last_stop;
+       unsigned int last_start;
+       unsigned int last_finish;
+       unsigned int node_count;
+};
+
+
+/* Commands to the barrier ioctl */
+#define BARRIER_IOCTL_REGISTER 1
+#define BARRIER_IOCTL_CHANGE   2
+#define BARRIER_IOCTL_DELETE   3
+#define BARRIER_IOCTL_WAIT     4
+
+/* Attributes of a barrier - bitmask */
+#define BARRIER_ATTR_AUTODELETE 1
+#define BARRIER_ATTR_MULTISTEP  2
+#define BARRIER_ATTR_MANUAL     4
+#define BARRIER_ATTR_ENABLED    8
+#define BARRIER_ATTR_CALLBACK  16
+
+/* Attribute setting commands */
+#define BARRIER_SETATTR_AUTODELETE 1
+#define BARRIER_SETATTR_MULTISTEP  2
+#define BARRIER_SETATTR_ENABLED    3
+#define BARRIER_SETATTR_NODES      4
+#define BARRIER_SETATTR_CALLBACK   5
+#define BARRIER_SETATTR_TIMEOUT    6
+
+#endif
diff --git a/daemons/clvmd/libclvm.c b/daemons/clvmd/libclvm.c
new file mode 100644 (file)
index 0000000..085e57e
--- /dev/null
@@ -0,0 +1,446 @@
+/*
+ * Copyright (C) 1997-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* library functions for Cluster LVM Daemon */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/un.h>
+#include <sys/time.h>
+#include <sys/ioctl.h>
+#include <sys/utsname.h>
+#include <syslog.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <signal.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <search.h>
+#include <errno.h>
+
+#include "clvm.h"
+#include "libclvm.h"
+
+/* CLVM in hex! */
+#define LVM_SIGNATURE 0x434C564D
+
+#define MAX_CLUSTER_MEMBER_NAME_LEN 255
+
+/* NOTE: the LVMD uses the socket FD as the client ID, this means
+   that any client that calls fork() will inherit the context of
+   it's parent. */
+static int clvmd_sock = -1;
+
+static int open_local_sock(void)
+{
+       int local_socket;
+       struct sockaddr_un sockaddr;
+
+       /* Open local socket */
+       local_socket = socket(PF_UNIX, SOCK_STREAM, 0);
+       if (local_socket < 0) {
+               perror("Can't create local socket");
+               return -1;
+       }
+
+       fcntl(local_socket, F_SETFD, !FD_CLOEXEC);
+
+       strcpy(sockaddr.sun_path, CLVMD_SOCKNAME);
+       sockaddr.sun_family = AF_UNIX;
+       if (connect
+           (local_socket, (struct sockaddr *) &sockaddr, sizeof(sockaddr))) {
+               int saved_errno = errno;
+
+               close(local_socket);
+
+               errno = saved_errno;
+               return -1;
+       }
+       return local_socket;
+}
+
+/* Send a request and return the status */
+static int send_request(char *inbuf, int inlen, char **retbuf)
+{
+       char outbuf[PIPE_BUF];
+       struct clvm_header *outheader = (struct clvm_header *) outbuf;
+       int len;
+       int off;
+       fd_set fds;
+
+       FD_ZERO(&fds);
+       FD_SET(clvmd_sock, &fds);
+
+       /* Send it to CLVMD */
+       if (write(clvmd_sock, inbuf, inlen) != inlen) {
+               perror("Error writing to CLVMD");
+               return -1;
+       }
+
+       /* Get the response */
+       if ((len = read(clvmd_sock, outbuf, sizeof(struct clvm_header))) < 0) {
+               perror("Error reading CLVMD");
+               return -1;
+       }
+       if (len == 0) {
+               fprintf(stderr, "EOF reading CLVMD");
+               errno = ENOTCONN;
+               return -1;
+       }
+
+       /* Allocate buffer */
+       *retbuf = malloc(len + outheader->arglen);
+       if (!*retbuf) {
+               errno = ENOMEM;
+               return -1;
+       }
+
+       /* Copy the header */
+       memcpy(*retbuf, outbuf, len);
+       outheader = (struct clvm_header *) *retbuf;
+
+       /* Read the returned values */
+       off = 1;                /* we've already read the first byte */
+
+       while (off < outheader->arglen && len > 0) {
+               len = read(clvmd_sock, outheader->args + off, PIPE_BUF);
+               if (len > 0)
+                       off += len;
+       }
+
+       /* Was it an error ? */
+       if (outheader->status < 0) {
+               errno = -outheader->status;
+               return -2;
+       }
+       return 0;
+}
+
+/* Build the structure header and parse-out wildcard node names */
+static void build_header(struct clvm_header *head, int cmd, const char *node,
+                        void *data, int len)
+{
+       head->cmd = cmd;
+       head->status = 0;
+       head->flags = 0;
+       head->clientid = 0;
+       head->arglen = len;
+       if (node) {
+               /* Allow a couple of special node names:
+                  "*" for all nodes,
+                  "." for the local node only
+                */
+               if (strcmp(node, "*") == 0) {
+                       head->node[0] = '\0';
+               } else if (strcmp(node, ".") == 0) {
+                       head->node[0] = '\0';
+                       head->flags = CLVMD_FLAG_LOCAL;
+               } else {
+                       strcpy(head->node, node);
+               }
+       } else {
+               head->node[0] = '\0';
+       }
+}
+
+/* Send a message to a(or all) node(s) in the cluster */
+int lvm_cluster_write(char cmd, char *node, void *data, int len)
+{
+       char outbuf[sizeof(struct clvm_header) + len + strlen(node) + 1];
+       char *retbuf = NULL;
+       int status;
+       struct clvm_header *head = (struct clvm_header *) outbuf;
+
+       if (clvmd_sock == -1)
+               clvmd_sock = open_local_sock();
+       if (clvmd_sock == -1)
+               return -1;
+
+       build_header(head, cmd, node, data, len);
+       memcpy(head->node + strlen(head->node) + 1, data, len);
+
+       status =
+           send_request(outbuf,
+                        sizeof(struct clvm_header) + strlen(head->node) + len,
+                        &retbuf);
+       if (retbuf)
+               free(retbuf);
+
+       return status;
+}
+
+/* API: Send a message to a(or all) node(s) in the cluster
+   and wait for replies */
+int lvm_cluster_request(char cmd, const char *node, void *data, int len,
+                       lvm_response_t ** response, int *num)
+{
+       char outbuf[sizeof(struct clvm_header) + len + strlen(node) + 1];
+       int *outptr;
+       char *inptr;
+       char *retbuf = NULL;
+       int status;
+       int i;
+       int num_responses = 0;
+       struct clvm_header *head = (struct clvm_header *) outbuf;
+       lvm_response_t *rarray;
+
+       *num = 0;
+
+       if (clvmd_sock == -1)
+               clvmd_sock = open_local_sock();
+       if (clvmd_sock == -1)
+               return -1;
+
+       build_header(head, cmd, node, data, len);
+       memcpy(head->node + strlen(head->node) + 1, data, len);
+
+       status =
+           send_request(outbuf,
+                        sizeof(struct clvm_header) + strlen(head->node) + len,
+                        &retbuf);
+       if (status == 0 || status == -2) {
+               /* Count the number of responses we got */
+               head = (struct clvm_header *) retbuf;
+               inptr = head->args;
+               while (inptr[0]) {
+                       num_responses++;
+                       inptr += strlen(inptr) + 1;
+                       inptr += sizeof(int);
+                       inptr += strlen(inptr) + 1;
+               }
+
+               /* Allocate response array. With an extra pair of INTs on the front to sanity
+                  check the pointer when we are given it back to free */
+               outptr =
+                   malloc(sizeof(lvm_response_t) * num_responses +
+                          sizeof(int) * 2);
+               if (!outptr) {
+                       if (retbuf)
+                               free(retbuf);
+                       errno = ENOMEM;
+                       return -1;
+               }
+
+               *response = (lvm_response_t *) (outptr + 2);
+               outptr[0] = LVM_SIGNATURE;
+               outptr[1] = num_responses;
+               rarray = *response;
+
+               /* Unpack the response into an lvm_response_t array */
+               inptr = head->args;
+               i = 0;
+               while (inptr[0]) {
+                       strcpy(rarray[i].node, inptr);
+                       inptr += strlen(inptr) + 1;
+
+                       rarray[i].status = *(int *) inptr;
+                       inptr += sizeof(int);
+
+                       rarray[i].response = malloc(strlen(inptr) + 1);
+                       if (rarray[i].response == NULL) {
+                               /* Free up everything else and return error */
+                               int j;
+                               for (j = 0; j < i; j++)
+                                       free(rarray[i].response);
+                               free(outptr);
+                               errno = ENOMEM;
+                               return -1;
+                       }
+
+                       strcpy(rarray[i].response, inptr);
+                       rarray[i].len = strlen(inptr);
+                       inptr += strlen(inptr) + 1;
+                       i++;
+               }
+               *num = num_responses;
+               *response = rarray;
+       }
+
+       if (retbuf)
+               free(retbuf);
+       return status;
+}
+
+/* API: Free reply array */
+int lvm_cluster_free_request(lvm_response_t * response)
+{
+       int *ptr = (int *) response - 2;
+       int i;
+       int num;
+
+       /* Check it's ours to free */
+       if (response == NULL || *ptr != LVM_SIGNATURE) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       num = ptr[1];
+       for (i = 0; i < num; i++) {
+               free(response[i].response);
+       }
+       free(ptr);
+
+       return 0;
+}
+
+/* These are a "higher-level" API providing black-box lock/unlock
+   functions for cluster LVM...maybe */
+
+/* Set by lock(), used by unlock() */
+static int num_responses;
+static lvm_response_t *response;
+
+int lvm_lock_for_cluster(char scope, char *name, int verbosity)
+{
+       int status;
+       int i;
+       char *args;
+       int len;
+
+       if (name) {
+               len = strlen(name) + 2;
+               args = alloca(len);
+               strcpy(args + 1, name);
+       } else {
+               len = 2;
+               args = alloca(len);
+               args[1] = '\0';
+       }
+       args[0] = scope;
+
+       status = lvm_cluster_request(CLVMD_CMD_LOCK,
+                                    "", args, len, &response, &num_responses);
+
+       /* If any nodes were down then display them and return an error */
+       for (i = 0; i < num_responses; i++) {
+               if (response[i].status == -EHOSTDOWN) {
+                       if (verbosity)
+                               fprintf(stderr,
+                                       "clvmd not running on node %s\n",
+                                       response[i].node);
+                       status = -1;
+               }
+       }
+
+       /* If there was an error then free the memory now as the caller won't
+          want to do the unlock */
+       if (status) {
+               int saved_errno = errno;
+               lvm_cluster_free_request(response);
+               num_responses = 0;
+               errno = saved_errno;
+       }
+       return status;
+}
+
+int lvm_unlock_for_cluster(char scope, char *name, int verbosity)
+{
+       int status;
+       int i;
+       int len;
+       int failed;
+       int num_unlock_responses;
+       char *args;
+       lvm_response_t *unlock_response;
+
+       /* We failed - this should not have been called */
+       if (num_responses == 0)
+               return 0;
+
+       if (name) {
+               len = strlen(name) + 2;
+               args = alloca(len);
+               strcpy(args + 1, name);
+       } else {
+               len = 2;
+               args = alloca(len);
+               args[1] = '\0';
+       }
+       args[0] = scope;
+
+       /* See if it failed anywhere */
+       failed = 0;
+       for (i = 0; i < num_responses; i++) {
+               if (response[i].status != 0)
+                       failed++;
+       }
+
+       /* If it failed on any nodes then we only unlock on
+          the nodes that succeeded */
+       if (failed) {
+               for (i = 0; i < num_responses; i++) {
+                       /* Unlock the ones that succeeded */
+                       if (response[i].status == 0) {
+                               status = lvm_cluster_request(CLVMD_CMD_UNLOCK,
+                                                            response[i].node,
+                                                            args, len,
+                                                            &unlock_response,
+                                                            &num_unlock_responses);
+                               if (status) {
+                                       if (verbosity)
+                                               fprintf(stderr,
+                                                       "cluster command to node %s failed: %s\n",
+                                                       response[i].node,
+                                                       strerror(errno));
+                               } else if (unlock_response[0].status != 0) {
+                                       if (verbosity > 1)
+                                               fprintf(stderr,
+                                                       "unlock on node %s failed: %s\n",
+                                                       response[i].node,
+                                                       strerror(unlock_response
+                                                                [0].status));
+                               }
+                               lvm_cluster_free_request(unlock_response);
+                       } else {
+                               if (verbosity)
+                                       fprintf(stderr,
+                                               "command on node %s failed: '%s' - will be left locked\n",
+                                               response[i].node,
+                                               strerror(response[i].status));
+                       }
+               }
+       } else {
+               /* All OK, we can do a full cluster unlock */
+               status = lvm_cluster_request(CLVMD_CMD_UNLOCK,
+                                            "",
+                                            args, len,
+                                            &unlock_response,
+                                            &num_unlock_responses);
+               if (status) {
+                       if (verbosity > 1)
+                               fprintf(stderr, "cluster command failed: %s\n",
+                                       strerror(errno));
+               } else {
+                       for (i = 0; i < num_unlock_responses; i++) {
+                               if (unlock_response[i].status != 0) {
+                                       if (verbosity > 1)
+                                               fprintf(stderr,
+                                                       "unlock on node %s failed: %s\n",
+                                                       response[i].node,
+                                                       strerror(unlock_response
+                                                                [0].status));
+                               }
+                       }
+               }
+               lvm_cluster_free_request(unlock_response);
+       }
+       lvm_cluster_free_request(response);
+
+       return 0;
+}
diff --git a/daemons/clvmd/libclvm.h b/daemons/clvmd/libclvm.h
new file mode 100644 (file)
index 0000000..bd735ce
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 1997-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LIBCLVM_H
+#define _LIBCLVM_H
+
+/* One per-node reply to a cluster-wide request. */
+typedef struct lvm_response {
+	char node[255];		/* Name of the node that sent this reply */
+	char *response;		/* Reply payload ("len" bytes) */
+	int status;		/* errno-style status from that node (0 = OK) */
+	int len;		/* Length of the response buffer */
+
+} lvm_response_t;
+
+/* Send cmd/data to one node ("" means the whole cluster) and collect *num
+   replies into *response; release the array with lvm_cluster_free_request() */
+extern int lvm_cluster_request(char cmd, const char *node, void *data, int len,
+                              lvm_response_t ** response, int *num);
+/* Fire-and-forget variant: send without collecting responses */
+extern int lvm_cluster_write(char cmd, char *node, void *data, int len);
+extern int lvm_cluster_free_request(lvm_response_t * response);
+
+/* The "high-level" API */
+extern int lvm_lock_for_cluster(char scope, char *name, int verbosity);
+extern int lvm_unlock_for_cluster(char scope, char *name, int verbosity);
+
+#endif
diff --git a/daemons/clvmd/lvm-functions.c b/daemons/clvmd/lvm-functions.c
new file mode 100644 (file)
index 0000000..400d33f
--- /dev/null
@@ -0,0 +1,446 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <syslog.h>
+#include <assert.h>
+
+#include "libdlm.h"
+#include "clvm.h"
+#include "clvmd-comms.h"
+#include "clvmd.h"
+#include "lvm-functions.h"
+
+/* LVM2 headers */
+#include "toolcontext.h"
+#include "log.h"
+#include "activate.h"
+#include "hash.h"
+#include "locking.h"
+
+static struct cmd_context *cmd = NULL;
+static struct hash_table *lv_hash = NULL;
+
+struct lv_info {
+       int lock_id;
+       int lock_mode;
+};
+
+/* Return the mode a lock is currently held at (or -1 if not held) */
+static int get_current_lock(char *resource)
+{
+	struct lv_info *lvi = hash_lookup(lv_hash, resource);
+
+	/* A resource we have no record of is simply not locked */
+	return lvi ? lvi->lock_mode : -1;
+}
+
+/* Called at shutdown to tidy the lockspace */
+void unlock_all()
+{
+	struct hash_node *v;
+	/* Release every lock we still hold.  Hash entries are not removed
+	   or freed here; this only runs on the exit path. */
+	hash_iterate(v, lv_hash) {
+		struct lv_info *lvi = hash_get_data(lv_hash, v);
+
+		sync_unlock(hash_get_key(lv_hash, v), lvi->lock_id);
+	}
+}
+
+/* Gets a real lock and keeps the info in the hash table.
+   Returns 0 on success; on failure returns sync_lock()'s status
+   (or -1 on malloc failure) with errno preserved for the caller. */
+int hold_lock(char *resource, int mode, int flags)
+{
+	int status;
+	int saved_errno;
+	struct lv_info *lvi;
+
+	flags &= LKF_NOQUEUE;	/* Only LKF_NOQUEUE is valid here */
+
+	lvi = hash_lookup(lv_hash, resource);
+	if (lvi) {
+		/* Already exists - convert it */
+		status =
+		    sync_lock(resource, mode, LKF_CONVERT | flags,
+			      &lvi->lock_id);
+		/* Save errno so the DEBUGLOG/strerror calls can't clobber it */
+		saved_errno = errno;
+		if (!status)
+			lvi->lock_mode = mode;
+
+		if (status) {
+			DEBUGLOG("hold_lock. convert to %d failed: %s\n", mode,
+				 strerror(errno));
+		}
+		errno = saved_errno;
+	} else {
+		/* First lock on this resource: record it in the hash table */
+		lvi = malloc(sizeof(struct lv_info));
+		if (!lvi)
+			return -1;
+
+		lvi->lock_mode = mode;
+		status = sync_lock(resource, mode, flags, &lvi->lock_id);
+		saved_errno = errno;
+		if (status) {
+			/* Lock failed - don't keep a stale hash entry */
+			free(lvi);
+			DEBUGLOG("hold_lock. lock at %d failed: %s\n", mode,
+				 strerror(errno));
+		} else {
+			hash_insert(lv_hash, resource, lvi);
+		}
+		errno = saved_errno;
+	}
+	return status;
+}
+
+/* Unlock and remove it from the hash table */
+int hold_unlock(char *resource)
+{
+	int status;
+	int saved_errno;
+	struct lv_info *lvi = hash_lookup(lv_hash, resource);
+
+	/* Not in the table means we never locked it - treat as success */
+	if (!lvi) {
+		DEBUGLOG("hold_unlock, lock not already held\n");
+		return 0;
+	}
+
+	status = sync_unlock(resource, lvi->lock_id);
+	saved_errno = errno;
+	if (status) {
+		DEBUGLOG("hold_unlock. unlock failed(%d): %s\n", status,
+			 strerror(errno));
+	} else {
+		/* Lock released - forget about the resource */
+		hash_remove(lv_hash, resource);
+		free(lvi);
+	}
+
+	/* Hand sync_unlock()'s errno back past the logging calls */
+	errno = saved_errno;
+	return status;
+}
+
+/* Watch the return codes here.
+   liblvm API functions return 1(true) for success, 0(false) for failure and don't set errno.
+   libdlm API functions return 0 for success, -1 for failure and do set errno.
+   These functions here return 0 for success or >0 for failure (where the retcode is errno)
+*/
+
+/* Activate LV exclusive or non-exclusive.
+   Returns 0 on success or an errno-style code (see comment block above). */
+static int do_activate_lv(char *resource, int mode)
+{
+	int oldmode;
+	int status;
+	int activate_lv;
+	struct lvinfo lvi;
+
+	/* Is it already open ? */
+	oldmode = get_current_lock(resource);
+	if (oldmode == mode) {
+		return 0;	/* Nothing to do */
+	}
+
+	/* Does the config file want us to activate this LV ? */
+	if (!lv_activation_filter(cmd, resource, &activate_lv))
+		return EIO;
+
+	if (!activate_lv)
+		return 0;	/* Success, we did nothing! */
+
+	/* Do we need to activate exclusively? */
+	if (activate_lv == 2)
+		mode = LKM_EXMODE;
+
+	/* OK, try to get the lock.  LKF_NOQUEUE: presumably fail rather
+	   than queue behind an incompatible holder - see libdlm */
+	status = hold_lock(resource, mode, LKF_NOQUEUE);
+	if (status)
+		return errno;	/* hold_lock() preserved the lock error here */
+
+	/* If it's suspended then resume it */
+	if (!lv_info_by_lvid(cmd, resource, &lvi))
+		return EIO;
+
+	if (lvi.suspended)
+		if (!lv_resume(cmd, resource))
+			return EIO;
+
+	/* Now activate it */
+	if (!lv_activate(cmd, resource))
+		return EIO;
+
+	return 0;
+}
+
+/* Resume the LV if it was active.
+   Returns 0 on success (including "nothing to do") or EIO on failure. */
+static int do_resume_lv(char *resource)
+{
+	int oldmode;
+
+	/* Is it open ? */
+	oldmode = get_current_lock(resource);
+	if (oldmode == -1) {
+		/* Fixed copy-pasted message: this is the resume path,
+		   not do_deactivate_lv */
+		DEBUGLOG("do_resume_lv, lock not already held\n");
+		return 0;	/* We don't need to do anything */
+	}
+
+	/* liblvm returns 1 for success - map failure to errno-style EIO */
+	if (!lv_resume_if_active(cmd, resource))
+		return EIO;
+
+	return 0;
+}
+
+/* Suspend the device if active.
+   Returns 0 on success (including "not active here") or EIO on failure. */
+static int do_suspend_lv(char *resource)
+{
+	int oldmode;
+	struct lvinfo lvi;
+
+	/* Is it open ? */
+	oldmode = get_current_lock(resource);
+	if (oldmode == -1) {
+		/* Fixed misleading message: -1 means the lock is NOT held,
+		   the old text claimed "lock held at -1" */
+		DEBUGLOG("do_suspend_lv, lock not already held\n");
+		return 0; /* Not active, so it's OK */
+	}
+
+	/* Only suspend it if it exists */
+	if (!lv_info_by_lvid(cmd, resource, &lvi))
+		return EIO;
+
+	if (lvi.exists) {
+		if (!lv_suspend_if_active(cmd, resource)) {
+			return EIO;
+		}
+	}
+	return 0;
+}
+
+/* Deactivate the LV and release our lock on it.
+   Returns 0 on success or an errno-style code. */
+static int do_deactivate_lv(char *resource)
+{
+	int status;
+
+	/* No lock held means the LV can't be active on this node */
+	if (get_current_lock(resource) == -1) {
+		DEBUGLOG("do_deactivate_lock, lock not already held\n");
+		return 0;	/* We don't need to do anything */
+	}
+
+	if (!lv_deactivate(cmd, resource))
+		return EIO;
+
+	status = hold_unlock(resource);
+	if (status)
+		return errno;	/* hold_unlock() preserved the unlock error */
+
+	return 0;
+}
+
+/* This is the LOCK_LV part that happens on all nodes in the cluster -
+   it is responsible for the interaction with device-mapper and LVM.
+   Returns 0 on success or an errno-style code. */
+int do_lock_lv(unsigned char command, unsigned char lock_flags, char *resource)
+{
+	int status = 0;
+
+	DEBUGLOG("do_lock_lv: resource '%s', cmd = 0x%x, flags = %d\n",
+		 resource, command, lock_flags);
+
+	/* Pick up any lvm.conf edits made since the last command */
+	if (!cmd->config_valid || config_files_changed(cmd)) {
+		/* Reinitialise various settings inc. logging, filters */
+		if (!refresh_toolcontext(cmd)) {
+			log_error("Updated config file invalid. Aborting.");
+			return EINVAL;
+		}
+	}
+
+	/* Dispatch to the helper for this sub-command; each returns
+	   0 or an errno-style code */
+	switch (command) {
+	case LCK_LV_EXCLUSIVE:
+		status = do_activate_lv(resource, LKM_EXMODE);
+		break;
+
+	case LCK_LV_SUSPEND:
+		status = do_suspend_lv(resource);
+		break;
+
+	case LCK_UNLOCK:
+	case LCK_LV_RESUME:	/* if active */
+		status = do_resume_lv(resource);
+		break;
+
+	case LCK_LV_ACTIVATE:
+		status = do_activate_lv(resource, LKM_CRMODE);
+		break;
+
+	case LCK_LV_DEACTIVATE:
+		status = do_deactivate_lv(resource);
+		break;
+
+	default:
+		DEBUGLOG("Invalid LV command 0x%x\n", command);
+		status = EINVAL;
+		break;
+	}
+
+	/* clean the pool for another command */
+	pool_empty(cmd->mem);
+
+	DEBUGLOG("Command return is %d\n", status);
+	return status;
+}
+
+/* Functions to do on the local node only BEFORE the cluster-wide stuff above happens */
+int pre_lock_lv(unsigned char command, unsigned char lock_flags, char *resource)
+{
+	/* Nearly all the stuff happens cluster-wide. Apart from SUSPEND. Here we get the
+	   lock out on this node (because we are the node modifying the metadata)
+	   before suspending cluster-wide.
+	 */
+	if (command != LCK_LV_SUSPEND)
+		return 0;
+
+	DEBUGLOG("pre_lock_lv: resource '%s', cmd = 0x%x, flags = %d\n",
+		 resource, command, lock_flags);
+
+	if (hold_lock(resource, LKM_PWMODE, LKF_NOQUEUE))
+		return errno;
+
+	return 0;
+}
+
+/* Functions to do on the local node only AFTER the cluster-wide stuff above happens */
+int post_lock_lv(unsigned char command, unsigned char lock_flags,
+		 char *resource)
+{
+	/* Opposite of above, done on resume after a metadata update */
+	if (command == LCK_LV_RESUME) {
+		int oldmode;
+
+		DEBUGLOG
+		    ("post_lock_lv: resource '%s', cmd = 0x%x, flags = %d\n",
+		     resource, command, lock_flags);
+
+		/* If the lock state is PW then restore it to what it was */
+		oldmode = get_current_lock(resource);
+		if (oldmode == LKM_PWMODE) {
+			struct lvinfo lvi;
+
+			if (!lv_info_by_lvid(cmd, resource, &lvi))
+				return EIO;
+
+			if (lvi.exists) {
+				/* LV survived the update: drop back to the
+				   shared CR mode used for active LVs */
+				if (hold_lock(resource, LKM_CRMODE, 0))
+					return errno;
+			} else {
+				/* LV was removed: no lock needed any more */
+				if (hold_unlock(resource))
+					return errno;
+			}
+		}
+	}
+	return 0;
+}
+
+/* Check if a VG is in use by LVM1 so we don't stomp on it */
+int do_check_lvm1(char *vgname)
+{
+	/* check_lvm1_vg_inactive() returns 1 when the VG is safe to use */
+	return (check_lvm1_vg_inactive(cmd, vgname) == 1) ? 0 : EBUSY;
+}
+
+/*
+ * Ideally, clvmd should be started before any LVs are active
+ * but this may not be the case...
+ * I suppose this also comes in handy if clvmd crashes, not that it would!
+ *
+ * Takes out a CR lock for every LV that "lvm lvs" reports as active or
+ * suspended on this node.  Always returns NULL (pthread-style signature).
+ */
+static void *get_initial_state()
+{
+	char lv[64], vg[64], flags[25];
+	char uuid[65];
+	char line[255];
+	FILE *lvs =
+	    popen
+	    ("/sbin/lvm lvs --nolocking --noheadings -o vg_uuid,lv_uuid,lv_attr",
+	     "r");
+
+	if (!lvs)
+		return NULL;
+
+	while (fgets(line, sizeof(line), lvs)) {
+		/* Field widths stop sscanf overflowing the fixed buffers */
+		if (sscanf(line, "%63s %63s %24s\n", vg, lv, flags) == 3) {
+			/* States: s:suspended a:active S:dropped snapshot I:invalid snapshot */
+			/* Length guards: the memcpy()s below read vg[0..37]
+			   and lv[0..37], and we index flags[4] */
+			if (strlen(vg) >= 38 && strlen(lv) >= 38 &&
+			    strlen(flags) >= 5 &&
+			    (flags[4] == 'a' || flags[4] == 's')) {	/* is it active or suspended? */
+				/* Convert hyphen-separated UUIDs into one */
+				memcpy(&uuid[0], &vg[0], 6);
+				memcpy(&uuid[6], &vg[7], 4);
+				memcpy(&uuid[10], &vg[12], 4);
+				memcpy(&uuid[14], &vg[17], 4);
+				memcpy(&uuid[18], &vg[22], 4);
+				memcpy(&uuid[22], &vg[27], 4);
+				memcpy(&uuid[26], &vg[32], 6);
+				memcpy(&uuid[32], &lv[0], 6);
+				memcpy(&uuid[38], &lv[7], 4);
+				memcpy(&uuid[42], &lv[12], 4);
+				memcpy(&uuid[46], &lv[17], 4);
+				memcpy(&uuid[50], &lv[22], 4);
+				memcpy(&uuid[54], &lv[27], 4);
+				memcpy(&uuid[58], &lv[32], 6);
+				uuid[64] = '\0';
+
+				DEBUGLOG("getting initial lock for %s\n", uuid);
+				hold_lock(uuid, LKM_CRMODE, LKF_NOQUEUE);
+			}
+		}
+	}
+	/* Stream came from popen() so it must be closed with pclose(),
+	   not fclose(), to reap the child process */
+	pclose(lvs);
+	return NULL;
+}
+
+/* Must be called before any hold_lock()/hold_unlock() use */
+void init_lvhash()
+{
+	/* Create hash table for keeping LV locks & status */
+	lv_hash = hash_create(100);
+}
+
+/* Called to initialise the LVM context of the daemon.
+   Returns 1 on success, 0 on failure (liblvm convention). */
+int init_lvm(void)
+{
+	if (!(cmd = create_toolcontext(NULL))) {
+		log_error("Failed to allocate command context");
+		return 0;
+	}
+
+	/* Use LOG_DAEMON for syslog messages instead of LOG_USER */
+	init_syslog(LOG_DAEMON);
+
+	/* Take locks for any LVs already active on this node */
+	get_initial_state();
+
+	return 1;
+}
diff --git a/daemons/clvmd/lvm-functions.h b/daemons/clvmd/lvm-functions.h
new file mode 100644 (file)
index 0000000..750eba9
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Functions in lvm-functions.c */
+
+#ifndef _LVM_FUNCTIONS_H
+#define _LVM_FUNCTIONS_H
+
+/* Local-node hook run BEFORE the cluster-wide LV command */
+extern int pre_lock_lv(unsigned char lock_cmd, unsigned char lock_flags,
+		       char *resource);
+/* Cluster-wide LV command processing (activate/suspend/resume/deactivate) */
+extern int do_lock_lv(unsigned char lock_cmd, unsigned char lock_flags,
+		      char *resource);
+/* Local-node hook run AFTER the cluster-wide LV command */
+extern int post_lock_lv(unsigned char lock_cmd, unsigned char lock_flags,
+			char *resource);
+/* Returns 0 if the VG is not in use by LVM1, EBUSY otherwise */
+extern int do_check_lvm1(char *vgname);
+extern int init_lvm(void);
+extern void init_lvhash(void);
+
+/* Lock bookkeeping: take/release locks and track them in the hash table */
+extern int hold_unlock(char *resource);
+extern int hold_lock(char *resource, int mode, int flags);
+extern void unlock_all(void);
+
+#endif
diff --git a/daemons/clvmd/system-lv.c b/daemons/clvmd/system-lv.c
new file mode 100644 (file)
index 0000000..5b359cd
--- /dev/null
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Routines dealing with the System LV */
+
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/un.h>
+#include <sys/time.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/utsname.h>
+#include <syslog.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <signal.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <errno.h>
+#include <mntent.h>
+
+#include "libdlm.h"
+#include "log.h"
+#include "list.h"
+#include "locking.h"
+#include "system-lv.h"
+#include "clvmd-comms.h"
+#ifdef HAVE_CCS
+#include "ccs.h"
+#endif
+
+#define SYSTEM_LV_FILESYSTEM "ext2"
+#define SYSTEM_LV_MOUNTPOINT "/tmp/.clvmd-XXXXXX"
+
+extern char *config_filename(void);
+
+static char system_lv_name[PATH_MAX] = { '\0' };
+static char mount_point[PATH_MAX] = { '\0' };
+static int mounted = 0;
+static int mounted_rw = 0;
+static int lockid;
+static const char *lock_name = "CLVM_SYSTEM_LV";
+
+/* Look in /proc/mounts or (as a last resort) /etc/mtab to
+   see if the system-lv is mounted. If it is mounted and we
+   think it's not then abort because we don't have the right
+   lock status and we don't know what other processes are doing with it.
+
+   Returns 1 for mounted, 0 for not mounted so it matches the condition
+   of the "mounted" static variable above.
+*/
+static int is_really_mounted(void)
+{
+	FILE *mountfile;
+	struct mntent *ment;
+
+	/* Prefer the kernel's view; fall back to /etc/mtab */
+	mountfile = setmntent("/proc/mounts", "r");
+	if (!mountfile)
+		mountfile = setmntent("/etc/mtab", "r");
+	if (!mountfile) {
+		log_error("Unable to open /proc/mounts or /etc/mtab");
+		return -1;
+	}
+
+	/* Scan all mount entries for the system LV device name */
+	while ((ment = getmntent(mountfile)) != NULL) {
+		if (strcmp(ment->mnt_fsname, system_lv_name) == 0) {
+			endmntent(mountfile);
+			return 1;
+		}
+	}
+
+	endmntent(mountfile);
+	return 0;
+}
+
+/* Get the system LV name from the config file (or environment) and
+   check nobody else has (un)mounted it behind our back.
+   Returns 0 on success, -1 on failure. */
+static int find_system_lv(void)
+{
+	if (system_lv_name[0] == '\0') {
+#ifdef HAVE_CCS
+		int error;
+		ccs_node_t *ctree;
+
+		/* Read the cluster config file */
+		/* Open the config file */
+		error = open_ccs_file(&ctree, "clvm.ccs");
+		if (error) {
+			perror("reading config file");
+			return -1;
+		}
+
+		/* Bounded copy - the configured value could exceed PATH_MAX */
+		snprintf(system_lv_name, sizeof(system_lv_name), "%s",
+			 find_ccs_str(ctree, "cluster/systemlv", '/',
+				      "/dev/vg/system_lv"));
+
+		/* Finished with config file */
+		close_ccs_file(ctree);
+#else
+		/* No CCS: take the device name from the environment.
+		   Bounded copy - the env var is attacker-controllable */
+		if (getenv("CLVMD_SYSTEM_LV"))
+			snprintf(system_lv_name, sizeof(system_lv_name), "%s",
+				 getenv("CLVMD_SYSTEM_LV"));
+		else
+			return -1;
+#endif
+	}
+
+	/* See if it has been mounted outside our control */
+	if (is_really_mounted() != mounted) {
+		log_error
+		    ("The system LV state has been mounted/umounted outside the control of clvmd\n"
+		     "it cannot be used for cluster communications until this is fixed.\n");
+		return -1;
+	}
+	return 0;
+}
+
+/* No prizes */
+int system_lv_umount(void)
+{
+	if (mounted) {
+		if (umount(mount_point) < 0) {
+			log_error("umount of system LV (%s) failed: %m\n",
+				  system_lv_name);
+			return -1;
+		}
+
+		/* Release the cluster lock taken at mount time */
+		sync_unlock(lock_name, lockid);
+		mounted = 0;
+
+		/* Remove the randomized mount point directory */
+		rmdir(mount_point);
+	}
+
+	return 0;
+}
+
+/* Mount the system LV (RO or RW) under a freshly-made temp directory,
+   taking the cluster lock first.  Returns 0 on success, -1 on failure. */
+int system_lv_mount(int readwrite)
+{
+	int status;
+	int saved_errno;
+	int fd;
+
+	if (find_system_lv()) {
+		errno = EBUSY;
+		return -1;
+	}
+
+	/* Is it already mounted suitably? */
+	if (mounted) {
+		if (!readwrite || (readwrite && mounted_rw)) {
+			return 0;
+		} else {
+			/* Mounted RO and we need RW */
+			if (system_lv_umount() < 0)
+				return -1;
+		}
+	}
+
+	/* Randomize the mount point */
+	strcpy(mount_point, SYSTEM_LV_MOUNTPOINT);
+	fd = mkstemp(mount_point);
+	if (fd < 0) {
+		log_error("mkstemp for system LV mount point failed: %m\n");
+		return -1;
+	}
+
+	/* Race condition here but there's no mkstemp for directories */
+	/* NOTE(review): mkdir mode 0600 omits the execute bit on a
+	   directory - confirm this is intended */
+	close(fd);
+	unlink(mount_point);
+	mkdir(mount_point, 0600);
+
+	/* Make sure we have a system-lv lock: EX for writers, CR for readers */
+	status =
+	    sync_lock(lock_name, (readwrite) ? LKM_EXMODE : LKM_CRMODE, 0,
+		      &lockid);
+	/* NOTE(review): the mount-point directory is left behind if the
+	   lock (or the mount below) fails */
+	if (status < 0)
+		return -1;
+
+	/* Mount it */
+	if (mount(system_lv_name, mount_point, SYSTEM_LV_FILESYSTEM,
+		  MS_MGC_VAL | MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_SYNCHRONOUS
+		  | (readwrite ? 0 : MS_RDONLY), NULL) < 0) {
+		/* mount(2) returns EINVAL if the volume has no FS on it. So, if we want to
+		   write to it we try to make a filesystem in it and retry the mount */
+		if (errno == EINVAL && readwrite) {
+			char cmd[256];
+
+			log_error("Attempting mkfs on system LV device %s\n",
+				  system_lv_name);
+			snprintf(cmd, sizeof(cmd), "/sbin/mkfs -t %s %s",
+				 SYSTEM_LV_FILESYSTEM, system_lv_name);
+			system(cmd);
+
+			if (mount
+			    (system_lv_name, mount_point, SYSTEM_LV_FILESYSTEM,
+			     MS_MGC_VAL | MS_NOSUID | MS_NODEV | MS_NOEXEC |
+			     MS_SYNCHRONOUS | (readwrite ? 0 : MS_RDONLY),
+			     NULL) == 0)
+				goto mounted;
+		}
+
+		/* Save errno so log_error/sync_unlock can't clobber it */
+		saved_errno = errno;
+		log_error("mount of system LV (%s, %s, %s) failed: %m\n",
+			  system_lv_name, mount_point, SYSTEM_LV_FILESYSTEM);
+		sync_unlock(lock_name, lockid);
+		errno = saved_errno;
+		return -1;
+	}
+
+      mounted:
+/* Set the internal flags */
+	mounted = 1;
+	mounted_rw = readwrite;
+
+	return 0;
+}
+
+/* Erase *all* files in the root directory of the system LV.
+   This *MUST* be called with an appropriate lock held!
+   The LV is left mounted RW because it is assumed that the
+   caller wants to write something here after clearing some space.
+   Returns 0 on success, -1 on failure. */
+int system_lv_eraseall(void)
+{
+	DIR *dir;
+	struct dirent *ent;
+	char fname[PATH_MAX];
+
+	/* Must be mounted R/W - fail if we can't get it */
+	if (system_lv_mount(1))
+		return -1;
+
+	dir = opendir(mount_point);
+	if (!dir)
+		return -1;
+
+	while ((ent = readdir(dir))) {
+		struct stat st;
+		snprintf(fname, sizeof(fname), "%s/%s", mount_point,
+			 ent->d_name);
+
+		/* stat() returns 0 on success; the old test was inverted,
+		   so st was examined uninitialized and nothing was ever
+		   unlinked.  S_ISREG also skips "." and ".." for us. */
+		if (stat(fname, &st) == 0) {
+			if (S_ISREG(st.st_mode))
+				unlink(fname);
+		}
+	}
+	closedir(dir);
+	return 0;
+}
+
+/* This is a "high-level" routine - it mounts the system LV, writes
+   the data into a file named after this node and then umounts the LV
+   again.  Returns 0 on success, -1 on failure (errno set). */
+int system_lv_write_data(char *data, ssize_t len)
+{
+	struct utsname nodeinfo;
+	char fname[PATH_MAX];
+	int outfile;
+	ssize_t thiswrite;
+	ssize_t written;
+
+	if (system_lv_mount(1))
+		return -1;
+
+	/* Build the file name we are going to use: <mountpoint>/<nodename> */
+	uname(&nodeinfo);
+	snprintf(fname, sizeof(fname), "%s/%s", mount_point, nodeinfo.nodename);
+
+	/* Open the file for output */
+	outfile = open(fname, O_RDWR | O_CREAT | O_TRUNC, 0600);
+	if (outfile < 0) {
+		/* Keep open()'s errno across the umount for the caller */
+		int saved_errno = errno;
+		system_lv_umount();
+		errno = saved_errno;
+		return -1;
+	}
+
+	/* Loop so short writes are retried until all len bytes are out */
+	written = 0;
+	do {
+		thiswrite = write(outfile, data + written, len - written);
+		if (thiswrite > 0)
+			written += thiswrite;
+
+	} while (written < len && thiswrite > 0);
+
+	close(outfile);
+
+	system_lv_umount();
+	/* thiswrite < 0 means the final write() failed */
+	return (thiswrite < 0) ? -1 : 0;
+}
+
+/* This is a "high-level" routine - it mounts the system LV, reads
+   the data from a named file and then umounts the LV again.
+   On success returns 0 with *len set to the bytes read; -1 on failure.
+   NOTE(review): assumes "data" is large enough to hold the whole file -
+   there is no buffer-size parameter; confirm callers guarantee this. */
+int system_lv_read_data(char *fname_base, char *data, ssize_t *len)
+{
+	char fname[PATH_MAX];
+	int outfile;
+	struct stat st;
+	ssize_t filesize;
+	ssize_t thisread;
+	ssize_t readbytes;
+
+	if (system_lv_mount(0))
+		return -1;
+
+	/* Build the file name we are going to use. */
+	snprintf(fname, sizeof(fname), "%s/%s", mount_point, fname_base);
+
+	/* Get the file size and stuff. Actually we only need the file size but
+	   this will also check that the file exists */
+	if (stat(fname, &st) < 0) {
+		/* Keep the original errno across the umount for the caller */
+		int saved_errno = errno;
+
+		log_error("stat of file %s on system LV failed: %m\n", fname);
+		system_lv_umount();
+		errno = saved_errno;
+		return -1;
+	}
+	filesize = st.st_size;
+
+	outfile = open(fname, O_RDONLY);
+	if (outfile < 0) {
+		int saved_errno = errno;
+
+		log_error("open of file %s on system LV failed: %m\n", fname);
+		system_lv_umount();
+		errno = saved_errno;
+		return -1;
+	}
+
+	/* Loop so short reads are retried until the whole file is in */
+	readbytes = 0;
+	do {
+		thisread =
+		    read(outfile, data + readbytes, filesize - readbytes);
+		if (thisread > 0)
+			readbytes += thisread;
+
+	} while (readbytes < filesize && thisread > 0);
+
+	close(outfile);
+
+	system_lv_umount();
+
+	/* thisread < 0 means the final read() failed */
+	*len = readbytes;
+	return (thisread < 0) ? -1 : 0;
+}
diff --git a/daemons/clvmd/system-lv.h b/daemons/clvmd/system-lv.h
new file mode 100644 (file)
index 0000000..b90ca44
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _CLVM_SYSTEM_LV_H
+#define _CLVM_SYSTEM_LV_H
+
+/* Prototypes for System-LV functions */
+
+/* "low-level" functions */
+extern int system_lv_umount(void);
+extern int system_lv_mount(int readwrite);	/* readwrite: 0 = RO, nonzero = RW */
+extern int system_lv_eraseall(void);
+
+/* "high-level" functions: each mounts the LV, transfers the data
+   and unmounts again.  All return 0 on success, -1 on failure. */
+extern int system_lv_write_data(char *data, ssize_t len);
+extern int system_lv_read_data(char *fname_base, char *data, ssize_t *len);
+
+#endif
diff --git a/daemons/clvmd/tcp-comms.c b/daemons/clvmd/tcp-comms.c
new file mode 100644 (file)
index 0000000..2e0406b
--- /dev/null
@@ -0,0 +1,480 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  2002-2003  All rights reserved.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* This provides the inter-clvmd communications for a system without CMAN.
+   There is a listening TCP socket which accepts new connections in the
+   normal way.
+   It can also make outgoing connections to the other clvmd nodes.
+*/
+
+
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <syslog.h>
+#include <netdb.h>
+#include <assert.h>
+
+#include "ccs.h"
+#include "clvm.h"
+#include "clvmd-comms.h"
+#include "clvmd.h"
+#include "clvmd-gulm.h"
+#include "hash.h"
+
+#define DEFAULT_TCP_PORT 21064
+
+static int listen_fd = -1;
+static int tcp_port;
+struct hash_table *sock_hash;
+
+static int get_tcp_port(int default_port);
+static int get_our_ip_address(char *addr, int *family);
+static int read_from_tcpsock(struct local_client *fd, char *buf, int len, char *csid,
+                            struct local_client **new_client);
+
+/* Called by init_cluster() to open up the listening socket */
+// TODO: IPv6 compat.
+int init_comms()
+{
+    struct sockaddr *addr = NULL;
+    struct sockaddr_in addr4;
+    struct sockaddr_in6 addr6;
+    int    addr_len;
+    int    family;
+    char   address[MAX_CSID_LEN];
+
+    sock_hash = hash_create(100);
+    tcp_port = get_tcp_port(DEFAULT_TCP_PORT);
+
+    /* Get IP address and IP type */
+    get_our_ip_address(address, &family);
+    if (family == AF_INET)
+    {
+       memcpy(&addr4.sin_addr, addr, sizeof(struct in_addr));
+       addr = (struct sockaddr *)&addr4;
+       addr4.sin_port = htons(tcp_port);
+       addr_len = sizeof(addr4);
+    }
+    else
+    {
+       memcpy(&addr6.sin6_addr, addr, sizeof(struct in6_addr));
+       addr = (struct sockaddr *)&addr6;
+       addr6.sin6_port = htons(tcp_port);
+       addr_len = sizeof(addr6);
+    }
+
+    listen_fd = socket(family, SOCK_STREAM, 0);
+
+    if (listen_fd < 0)
+    {
+       return -1;
+    }
+    else
+    {
+       int one = 1;
+       setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int));
+    }
+
+    addr->sa_family = family;
+
+    if (bind(listen_fd, addr, addr_len) < 0)
+    {
+       DEBUGLOG("Can't bind to port\n");
+       syslog(LOG_ERR, "Can't bind to port %d, is clvmd already running ?", tcp_port);
+       close(listen_fd);
+       return -1;
+    }
+
+    listen(listen_fd, 5);
+
+    return 0;
+}
+
+void tcp_remove_client(char *csid)
+ {
+    struct local_client *client;
+    DEBUGLOG("tcp_remove_client\n");
+
+    /* Don't actually close the socket here - that's the
+       job of clvmd.c whch will do the job when it notices the
+       other end has gone. We just need to remove the client(s) from
+       the hash table so we don't try to use it for sending any more */
+    client = hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN);
+    if (client)
+    {
+       hash_remove_binary(sock_hash, csid, MAX_CSID_LEN);
+    }
+
+    /* Look for a mangled one too */
+    csid[0] ^= 0x80;
+
+    client = hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN);
+    if (client)
+    {
+       hash_remove_binary(sock_hash, csid, MAX_CSID_LEN);
+    }
+
+    /* Put it back as we found it */
+    csid[0] ^= 0x80;
+}
+
+int alloc_client(int fd, char *csid, struct local_client **new_client)
+{
+    struct local_client *client;
+
+    DEBUGLOG("alloc_client %d csid = [%d.%d.%d.%d]\n", fd,csid[0],csid[1],csid[2],csid[3]);
+
+    /* Create a local_client and return it */
+    client = malloc(sizeof(struct local_client));
+    if (!client)
+    {
+       DEBUGLOG("malloc failed\n");
+       return -1;
+    }
+
+    memset(client, 0, sizeof(struct local_client));
+    client->fd = fd;
+    client->type = CLUSTER_DATA_SOCK;
+    client->callback = read_from_tcpsock;
+    if (new_client)
+       *new_client = client;
+
+    /* Add to our list of node sockets */
+    if (hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN))
+    {
+       DEBUGLOG("alloc_client mangling CSID for second connection\n");
+       /* This is a duplicate connection but we can't close it because
+          the other end may already have started sending.
+          So, we mangle the IP address and keep it, all sending will
+          go out of the main FD
+       */
+       csid[0] ^= 0x80;
+       client->bits.net.flags = 1; /* indicate mangled CSID */
+
+        /* If it still exists then kill the connection as we should only
+           ever have one incoming connection from each node */
+        if (hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN))
+        {
+           DEBUGLOG("Multiple incoming connections from node\n");
+            syslog(LOG_ERR, " Bogus incoming connection from %d.%d.%d.%d\n", csid[0],csid[1],csid[2],csid[3]);
+
+           free(client);
+            errno = ECONNREFUSED;
+            return -1;
+        }
+    }
+    hash_insert_binary(sock_hash, csid, MAX_CSID_LEN, client);
+
+    return 0;
+}
+
/* Return the fd clvmd's main loop should poll for new cluster
   connections: the TCP listening socket opened by init_comms()
   (-1 if init_comms() has not run or failed). */
int get_main_cluster_fd()
{
    return listen_fd;
}
+
+
+/* Read on main comms (listen) socket, accept it */
+int cluster_fd_callback(struct local_client *fd, char *buf, int len, char *csid,
+                       struct local_client **new_client)
+{
+    int newfd;
+    struct sockaddr_in addr;
+    socklen_t addrlen = sizeof(addr);
+    int status;
+    char name[MAX_CLUSTER_MEMBER_NAME_LEN];
+
+    DEBUGLOG("cluster_fd_callback\n");
+    *new_client = NULL;
+    newfd = accept(listen_fd, (struct sockaddr *)&addr, &addrlen);
+
+    DEBUGLOG("cluster_fd_callback, newfd=%d (errno=%d)\n", newfd, errno);
+    if (!newfd)
+    {
+       syslog(LOG_ERR, "error in accept: %m");
+       errno = EAGAIN;
+       return -1; /* Don't return an error or clvmd will close the listening FD */
+    }
+
+    /* Check that the client is a member of the cluster
+       and reject if not.
+       // FIXME: IPv4 specific
+    */
+    if (name_from_csid((char *)&addr.sin_addr.s_addr, name) < 0)
+    {
+       char *ip = (char *)&addr.sin_addr.s_addr;
+       syslog(LOG_ERR, "Got connect from non-cluster node %d.%d.%d.%d\n",
+              ip[0], ip[1], ip[2], ip[3]);
+       DEBUGLOG("Got connect from non-cluster node %d.%d.%d.%d\n",
+                ip[0], ip[1], ip[2], ip[3]);
+       close(newfd);
+
+       errno = EAGAIN;
+       return -1;
+    }
+
+    status = alloc_client(newfd, (char *)&addr.sin_addr.s_addr, new_client);
+    if (status)
+    {
+       DEBUGLOG("cluster_fd_callback, alloc_client failed, status = %d\n", status);
+       close(newfd);
+       /* See above... */
+       errno = EAGAIN;
+       return -1;
+    }
+    DEBUGLOG("cluster_fd_callback, returning %d, %p\n", newfd, *new_client);
+    return newfd;
+}
+
+
+static int read_from_tcpsock(struct local_client *client, char *buf, int len, char *csid,
+                            struct local_client **new_client)
+{
+    struct sockaddr_in addr;
+    socklen_t slen = sizeof(addr);
+    int status;
+
+    DEBUGLOG("read_from_tcpsock fd %d\n", client->fd);
+    *new_client = NULL;
+
+    /* Get "csid" */
+    getpeername(client->fd, (struct sockaddr *)&addr, &slen);
+    memcpy(csid, &addr.sin_addr.s_addr, MAX_CSID_LEN);
+
+    status = read(client->fd, buf, len);
+
+    DEBUGLOG("read_from_tcpsock, status = %d(errno = %d)\n", status, errno);
+
+    /* Remove it from the hash table if there's an error, clvmd will
+       remove the socket from its lists and free the client struct */
+    if (status == 0 ||
+       (status < 0 && errno != EAGAIN && errno != EINTR))
+    {
+       char remcsid[MAX_CSID_LEN];
+
+       memcpy(remcsid, csid, MAX_CSID_LEN);
+       close(client->fd);
+
+       /* If the csid was mangled, then make sure we remove the right entry */
+       if (client->bits.net.flags)
+           remcsid[0] ^= 0x80;
+       hash_remove_binary(sock_hash, remcsid, MAX_CSID_LEN);
+
+       /* Tell cluster manager layer */
+       add_down_node(remcsid);
+    }
+    return status;
+}
+
+static int connect_csid(char *csid, struct local_client **newclient)
+{
+    int fd;
+    struct sockaddr_in addr;
+    int status;
+
+    DEBUGLOG("Connecting socket\n");
+    fd = socket(PF_INET, SOCK_STREAM, 0);
+
+    if (fd < 0)
+    {
+       syslog(LOG_ERR, "Unable to create new socket: %m");
+       return -1;
+    }
+
+    addr.sin_family = AF_INET;
+    memcpy(&addr.sin_addr.s_addr, csid, MAX_CSID_LEN);
+    addr.sin_port = htons(tcp_port);
+
+    DEBUGLOG("Connecting socket %d\n", fd);
+    if (connect(fd, (struct sockaddr *)&addr, sizeof(struct sockaddr_in)) < 0)
+    {
+       syslog(LOG_ERR, "Unable to connect to remote node: %m");
+       DEBUGLOG("Unable to connect to remote node: %s\n", strerror(errno));
+       close(fd);
+       return -1;
+    }
+
+    status = alloc_client(fd, csid, newclient);
+    if (status)
+       close(fd);
+    else
+       add_client(*newclient);
+
+    /* If we can connect to it, it must be running a clvmd */
+    add_up_node(csid);
+    return status;
+}
+
+/* Send a message to a known CSID */
+static int tcp_send_message(void *buf, int msglen, unsigned char *csid, const char *errtext)
+{
+    int status;
+    struct local_client *client;
+    char ourcsid[MAX_CSID_LEN];
+
+    assert(csid);
+
+    DEBUGLOG("tcp_send_message, csid = [%d.%d.%d.%d], msglen = %d\n", csid[0],csid[1],csid[2],csid[3], msglen);
+
+    /* Don't connect to ourself */
+    get_our_csid(ourcsid);
+    if (memcmp(csid, ourcsid, MAX_CSID_LEN) == 0)
+       return msglen;
+
+    client = hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN);
+    if (!client)
+    {
+       status = connect_csid(csid, &client);
+       if (status)
+           return -1;
+    }
+    DEBUGLOG("tcp_send_message, fd = %d\n", client->fd);
+
+    return write(client->fd, buf, msglen);
+}
+
+
/* Send a message to one node (csid) or, when csid is NULL, broadcast it
   to every node the cluster layer knows about (not just the connected
   ones).  Returns the last tcp_send_message() status. */
int cluster_send_message(void *buf, int msglen, char *csid, const char *errtext)
{
    int status=0;

    DEBUGLOG("cluster send message, csid = %p, msglen = %d\n", csid, msglen);

    /* If csid is NULL then send to all known (not just connected) nodes */
    if (!csid)
    {
	void *context = NULL;
	char loop_csid[MAX_CSID_LEN];

	/* Loop round all gulm-known nodes */
	while (get_next_node_csid(&context, loop_csid))
	{
	    status = tcp_send_message(buf, msglen, loop_csid, errtext);
	    /* NOTE(review): this aborts the whole broadcast on the first
	       zero-byte send or EAGAIN/EINTR failure, leaving the
	       remaining nodes unsent — confirm that is intended. */
	    if (status == 0 ||
		(status < 0 && (errno == EAGAIN || errno == EINTR)))
		break;
	}
    }
    else
    {

	status = tcp_send_message(buf, msglen, csid, errtext);
    }
    return status;
}
+
/* Look up the clvmd port in CCS ("//clvm/@port"); fall back to
   'default_port' when CCS is unavailable, the key is absent, or the
   configured value is out of range. */
static int get_tcp_port(int default_port)
{
    int ccs_handle;
    int port = default_port;
    char *portstr;

    /* BUGFIX: ccs_connect() returns a negative value on failure; the
       original bailed out whenever the handle was non-zero, i.e. also on
       success, so the configured port was never read.
       NOTE(review): confirm against the CCS API in use. */
    ccs_handle = ccs_connect();
    if (ccs_handle < 0)
    {
	return port;
    }

    if (!ccs_get(ccs_handle, "//clvm/@port", &portstr))
    {
	port = atoi(portstr);
	free(portstr);

	/* BUGFIX: '&&' made this range check unsatisfiable */
	if (port <= 0 || port >= 65536)
	    port = default_port;
    }
    ccs_disconnect(ccs_handle);

    DEBUGLOG("Using port %d for communications\n", port);
    return port;
}
+
/* To get our own IP address we get the locally bound address of the
   socket that's talking to GULM in the assumption(eek) that it will
   be on the "right" network in a multi-homed system.
   On success copies the raw address into 'addr', stores the address
   family in '*family' and returns 0; returns -1 on failure. */
static int get_our_ip_address(char *addr, int *family)
{
    /* Use a sockaddr_in6 to make sure it's big enough */
    struct sockaddr_in6 saddr;
    socklen_t socklen = sizeof(saddr);

    if (!getsockname(gulm_fd(), (struct sockaddr *)&saddr, &socklen))
    {
	/* BUGFIX: the original never set *family, so callers (e.g.
	   init_comms) read an uninitialised value. */
	*family = saddr.sin6_family;

	if (saddr.sin6_family == AF_INET6)
	{
	    memcpy(addr, &saddr.sin6_addr, sizeof(saddr.sin6_addr));
	}
	else
	{
	    struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr;
	    memcpy(addr, &sin4->sin_addr, sizeof(sin4->sin_addr));
	}
	return 0;
    }
    return -1;
}
+
+/* Public version of above for those that don't care what protocol
+   we're using */
+void get_our_csid(char *csid)
+{
+    static char our_csid[MAX_CSID_LEN];
+    static int got_csid = 0;
+
+    if (!got_csid)
+    {
+       int family;
+
+       memset(our_csid, 0, sizeof(our_csid));
+       if (get_our_ip_address(our_csid, &family))
+       {
+           got_csid = 1;
+       }
+    }
+    memcpy(csid, our_csid, MAX_CSID_LEN);
+}
+
+/* Get someone else's IP address from DNS */
+int get_ip_address(char *node, char *addr)
+{
+    struct hostent *he;
+
+    memset(addr, 0, MAX_CSID_LEN);
+
+    // TODO: what do we do about multi-homed hosts ???
+    // CCSs ip_interfaces solved this but some bugger removed it.
+
+    /* Try IPv6 first. The man page for gethostbyname implies that
+       it will lookup ip6 & ip4 names, but it seems not to */
+    he = gethostbyname2(node, AF_INET6);
+    if (!he)
+       he = gethostbyname2(node, AF_INET);
+    if (!he)
+       return -1;
+
+    /* For IPv4 address just use the lower 4 bytes */
+    memcpy(&addr, he->h_addr_list[0],
+          he->h_length);
+
+    return 0;
+}
diff --git a/daemons/clvmd/tcp-comms.h b/daemons/clvmd/tcp-comms.h
new file mode 100644 (file)
index 0000000..8dafd44
--- /dev/null
@@ -0,0 +1,7 @@
+#include <netinet/in.h>
+
+#define MAX_CLUSTER_MESSAGE 1600
+#define MAX_CSID_LEN sizeof(struct in6_addr)
+#define MAX_CLUSTER_MEMBER_NAME_LEN 128
+
+extern int init_comms(void);
index 74987de7c96d686de81e52704c9207d73d3dc2ac..54d27bc6a87c187616263f5f76172f39a2b54df5 100644 (file)
@@ -1,3 +1,4 @@
+../daemons/clvmd/clvm.h
 ../lib/activate/activate.h
 ../lib/activate/targets.h
 ../lib/cache/lvmcache.h
index f0da66e7a74bfcbf4b98ab16dddc55953cd9cc28..75ff735b92b2751bc7fb46d13490fd0fde93e9bf 100644 (file)
@@ -104,6 +104,14 @@ ifeq ("@POOL@", "internal")
        format_pool/pool_label.c
 endif
 
+ifeq ("@CLUSTER@", "internal")
+  SOURCES += locking/cluster_locking.c
+endif
+
+ifeq ("@CLUSTER@", "shared")
+  SUBDIRS += locking
+endif
+
 ifeq ("@SNAPSHOTS@", "internal")
   SOURCES += snapshot/snapshot.c
 endif
diff --git a/lib/locking/Makefile.in b/lib/locking/Makefile.in
new file mode 100644 (file)
index 0000000..d28cfbf
--- /dev/null
@@ -0,0 +1,32 @@
+#
+# Copyright (C) 2003-2004 Sistina Software, Inc. All rights reserved.
+# Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+#
+# This file is part of the LVM2.
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions
+# of the GNU General Public License v.2.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+
+SOURCES = cluster_locking.c
+
+LIB_SHARED = liblvm2clusterlock.so
+
+include $(top_srcdir)/make.tmpl
+
+.PHONY: install
+
+install: liblvm2clusterlock.so
+       $(INSTALL) -D $(OWNER) $(GROUP) -m 555 $(STRIP) $< \
+               $(libdir)/liblvm2clusterlock.so.$(LIB_VERSION)
+       $(LN_S) -f liblvm2clusterlock.so.$(LIB_VERSION) \
+               $(libdir)/liblvm2clusterlock.so
+
diff --git a/lib/locking/cluster_locking.c b/lib/locking/cluster_locking.c
new file mode 100644 (file)
index 0000000..d9cab2d
--- /dev/null
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * Locking functions for LVM.
+ * The main purpose of this part of the library is to serialise LVM
+ * management operations across a cluster.
+ */
+
+#include "lib.h"
+#include "clvm.h"
+#include "lvm-string.h"
+#include "locking.h"
+#include "locking_types.h"
+
+#include <stddef.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#ifndef CLUSTER_LOCKING_INTERNAL
+int lock_resource(struct cmd_context *cmd, const char *resource, int flags);
+void locking_end(void);
+int locking_init(int type, struct config_tree *cf, uint32_t *flags);
+#endif
+
+typedef struct lvm_response {
+       char node[255];
+       char *response;
+       int status;
+       int len;
+} lvm_response_t;
+
+/*
+ * This gets stuck at the start of memory we allocate so we
+ * can sanity-check it at deallocation time
+ */
+#define LVM_SIGNATURE 0x434C564D
+
+/*
+ * NOTE: the LVMD uses the socket FD as the client ID, this means
+ * that any client that calls fork() will inherit the context of
+ * it's parent.
+ */
+static int _clvmd_sock = -1;
+
+/* FIXME Install SIGPIPE handler? */
+
+/* Open connection to the Cluster Manager daemon */
+static int _open_local_sock(void)
+{
+       int local_socket;
+       struct sockaddr_un sockaddr;
+
+       /* Open local socket */
+       if ((local_socket = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) {
+               log_error("Local socket creation failed: %s", strerror(errno));
+               return -1;
+       }
+
+       memset(&sockaddr, 0, sizeof(sockaddr));
+       memcpy(sockaddr.sun_path, CLVMD_SOCKNAME, sizeof(CLVMD_SOCKNAME));
+
+       sockaddr.sun_family = AF_UNIX;
+
+       if (connect(local_socket,(struct sockaddr *) &sockaddr,
+                   sizeof(sockaddr))) {
+               int saved_errno = errno;
+
+               log_error("connect() failed on local socket: %s",
+                         strerror(errno));
+               if (close(local_socket))
+                       stack;
+
+               errno = saved_errno;
+               return -1;
+       }
+
+       return local_socket;
+}
+
/* Send a request and return the status.
   Writes 'inlen' bytes of 'inbuf' to clvmd, then reads back a
   clvm_header plus its argument payload into a freshly dbg_malloc'd
   buffer stored in *retbuf (the caller frees it).  Returns 1 on
   success, 0 on any failure (errno set where meaningful). */
static int _send_request(char *inbuf, int inlen, char **retbuf)
{
	char outbuf[PIPE_BUF];
	struct clvm_header *outheader = (struct clvm_header *) outbuf;
	int len;
	int off;
	int buflen;
	int err;

	/* Send it to CLVMD */
 rewrite:
	/* NOTE(review): a short-but-successful write (err != inlen and
	   err != -1) is treated as a hard error rather than resumed —
	   confirm that is acceptable for PIPE_BUF-sized requests. */
	if ( (err = write(_clvmd_sock, inbuf, inlen)) != inlen) {
		if (err == -1 && errno == EINTR)
			goto rewrite;
		log_error("Error writing data to clvmd: %s", strerror(errno));
		return 0;
	}

	/* Get the response */
 reread:
	if ((len = read(_clvmd_sock, outbuf, sizeof(struct clvm_header))) < 0) {
		if (errno == EINTR)
			goto reread;
		log_error("Error reading data from clvmd: %s", strerror(errno));
		return 0;
	}

	if (len == 0) {
		log_error("EOF reading CLVMD");
		errno = ENOTCONN;
		return 0;
	}

	/* Allocate buffer: header bytes already read plus the remaining
	   argument payload announced in the header */
	buflen = len + outheader->arglen;
	*retbuf = dbg_malloc(buflen);
	if (!*retbuf) {
		errno = ENOMEM;
		return 0;
	}

	/* Copy the header */
	memcpy(*retbuf, outbuf, len);
	outheader = (struct clvm_header *) *retbuf;

	/* Read the returned values */
	off = 1;		/* we've already read the first byte */
	/* NOTE(review): off starts at 1 — presumably struct clvm_header
	   ends in a one-byte args[] member, so the header-sized read
	   above already delivered args[0]; confirm against clvm.h. */

	while (off < outheader->arglen && len > 0) {
		len = read(_clvmd_sock, outheader->args + off,
			   buflen - off - offsetof(struct clvm_header, args));
		if (len > 0)
			off += len;
	}

	/* Was it an error ?  clvmd reports failures as a negative errno
	   in the header's status field. */
	if (outheader->status < 0) {
		errno = -outheader->status;
		log_error("cluster send request failed: %s", strerror(errno));
		return 0;
	}

	return 1;
}
+
+/* Build the structure header and parse-out wildcard node names */
+static void _build_header(struct clvm_header *head, int cmd, const char *node,
+                         int len)
+{
+       head->cmd = cmd;
+       head->status = 0;
+       head->flags = 0;
+       head->clientid = 0;
+       head->arglen = len;
+
+       if (node) {
+               /*
+                * Allow a couple of special node names:
+                * "*" for all nodes,
+                * "." for the local node only
+                */
+               if (strcmp(node, "*") == 0) {
+                       head->node[0] = '\0';
+               } else if (strcmp(node, ".") == 0) {
+                       head->node[0] = '\0';
+                       head->flags = CLVMD_FLAG_LOCAL;
+               } else
+                       strcpy(head->node, node);
+       } else
+               head->node[0] = '\0';
+}
+
+/*
+ * Send a message to a(or all) node(s) in the cluster and wait for replies
+ */
+static int _cluster_request(char cmd, const char *node, void *data, int len,
+                          lvm_response_t ** response, int *num)
+{
+       char outbuf[sizeof(struct clvm_header) + len + strlen(node) + 1];
+       int *outptr;
+       char *inptr;
+       char *retbuf = NULL;
+       int status;
+       int i;
+       int num_responses = 0;
+       struct clvm_header *head = (struct clvm_header *) outbuf;
+       lvm_response_t *rarray;
+
+       *num = 0;
+
+       if (_clvmd_sock == -1)
+               _clvmd_sock = _open_local_sock();
+
+       if (_clvmd_sock == -1)
+               return 0;
+
+       _build_header(head, cmd, node, len);
+       memcpy(head->node + strlen(head->node) + 1, data, len);
+
+       status = _send_request(outbuf, sizeof(struct clvm_header) +
+                             strlen(head->node) + len, &retbuf);
+       if (!status)
+               goto out;
+
+       /* Count the number of responses we got */
+       head = (struct clvm_header *) retbuf;
+       inptr = head->args;
+       while (inptr[0]) {
+               num_responses++;
+               inptr += strlen(inptr) + 1;
+               inptr += sizeof(int);
+               inptr += strlen(inptr) + 1;
+       }
+
+       /*
+        * Allocate response array.
+        * With an extra pair of INTs on the front to sanity
+        * check the pointer when we are given it back to free
+        */
+       outptr = dbg_malloc(sizeof(lvm_response_t) * num_responses +
+                           sizeof(int) * 2);
+       if (!outptr) {
+               errno = ENOMEM;
+               status = 0;
+               goto out;
+       }
+
+       *response = (lvm_response_t *) (outptr + 2);
+       outptr[0] = LVM_SIGNATURE;
+       outptr[1] = num_responses;
+       rarray = *response;
+
+       /* Unpack the response into an lvm_response_t array */
+       inptr = head->args;
+       i = 0;
+       while (inptr[0]) {
+               strcpy(rarray[i].node, inptr);
+               inptr += strlen(inptr) + 1;
+
+               rarray[i].status = *(int *) inptr;
+               inptr += sizeof(int);
+
+               rarray[i].response = dbg_malloc(strlen(inptr) + 1);
+               if (rarray[i].response == NULL) {
+                       /* Free up everything else and return error */
+                       int j;
+                       for (j = 0; j < i; j++)
+                               dbg_free(rarray[i].response);
+                       free(outptr);
+                       errno = ENOMEM;
+                       status = -1;
+                       goto out;
+               }
+
+               strcpy(rarray[i].response, inptr);
+               rarray[i].len = strlen(inptr);
+               inptr += strlen(inptr) + 1;
+               i++;
+       }
+       *num = num_responses;
+       *response = rarray;
+
+      out:
+       if (retbuf)
+               dbg_free(retbuf);
+
+       return status;
+}
+
+/* Free reply array */
+static int _cluster_free_request(lvm_response_t * response)
+{
+       int *ptr = (int *) response - 2;
+       int i;
+       int num;
+
+       /* Check it's ours to free */
+       if (response == NULL || *ptr != LVM_SIGNATURE) {
+               errno = EINVAL;
+               return 0;
+       }
+
+       num = ptr[1];
+
+       for (i = 0; i < num; i++) {
+               dbg_free(response[i].response);
+       }
+
+       dbg_free(ptr);
+
+       return 1;
+}
+
+static int _lock_for_cluster(unsigned char cmd, unsigned int flags, char *name)
+{
+       int status;
+       int i;
+       char *args;
+       const char *node = "";
+       int len;
+       int saved_errno = errno;
+       lvm_response_t *response = NULL;
+       int num_responses;
+
+       assert(name);
+
+       len = strlen(name) + 3;
+       args = alloca(len);
+       strcpy(args + 2, name);
+
+       args[0] = flags & 0xBF; /* Maskoff LOCAL flag */
+       args[1] = 0;            /* Not used now */
+
+       /*
+        * VG locks are just that: locks, and have no side effects
+        * so we only need to do them on the local node because all
+        * locks are cluster-wide.
+        * Also, if the lock is exclusive it makes no sense to try to 
+        * acquire it on all nodes, so just do that on the local node too.
+        */
+       if (cmd == CLVMD_CMD_LOCK_VG ||
+           (flags & LCK_TYPE_MASK) == LCK_EXCL ||
+           (flags & LCK_LOCAL))
+               node = ".";
+
+       status = _cluster_request(cmd, node, args, len,
+                                 &response, &num_responses);
+
+       /* If any nodes were down then display them and return an error */
+       for (i = 0; i < num_responses; i++) {
+               if (response[i].status == -EHOSTDOWN) {
+                       log_error("clvmd not running on node %s",
+                                 response[i].node);
+                       status = 0;
+               } else if (response[i].status) {
+                       log_error("Error locking on node %s: %s",
+                                 response[i].node,
+                                 response[i].response[0] ?
+                                       response[i].response :
+                                       strerror(response[i].status));
+                       status = 0;
+               }
+       }
+
+       saved_errno = errno;
+       _cluster_free_request(response);
+       errno = saved_errno;
+
+       return status;
+}
+
+/* API entry point for LVM */
+#ifdef CLUSTER_LOCKING_INTERNAL
+static int _lock_resource(struct cmd_context *cmd, const char *resource,
+                         int flags)
+#else
+int lock_resource(struct cmd_context *cmd, const char *resource, int flags)
+#endif
+{
+       char lockname[PATH_MAX];
+       int cluster_cmd = 0;
+
+       assert(strlen(resource) < sizeof(lockname));
+
+       switch (flags & LCK_SCOPE_MASK) {
+       case LCK_VG:
+               /* If the VG name is empty then lock the unused PVs */
+               if (!resource || !*resource)
+                       lvm_snprintf(lockname, sizeof(lockname), "P_orphans");
+               else
+                       lvm_snprintf(lockname, sizeof(lockname), "V_%s",
+                                    resource);
+
+               cluster_cmd = CLVMD_CMD_LOCK_VG;
+               flags &= LCK_TYPE_MASK;
+               break;
+
+       case LCK_LV:
+               cluster_cmd = CLVMD_CMD_LOCK_LV;
+               strcpy(lockname, resource);
+               flags &= 0xffdf;        /* Mask off HOLD flag */
+               break;
+
+       default:
+               log_error("Unrecognised lock scope: %d",
+                         flags & LCK_SCOPE_MASK);
+               return 0;
+       }
+
+       /* Send a message to the cluster manager */
+       log_very_verbose("Locking %s at 0x%x", lockname, flags);
+
+       return _lock_for_cluster(cluster_cmd, flags, lockname);
+}
+
/* Shut down cluster locking: close the clvmd connection (if open) and
   mark it closed so a later init reopens it. */
#ifdef CLUSTER_LOCKING_INTERNAL
static void _locking_end(void)
#else
void locking_end(void)
#endif
{
	if (_clvmd_sock != -1 && close(_clvmd_sock))
		stack;

	_clvmd_sock = -1;
}
+
+#ifdef CLUSTER_LOCKING_INTERNAL
+static void _reset_locking(void)
+#else
+void reset_locking(void)
+#endif
+{
+       if (close(_clvmd_sock))
+               stack;
+
+       _clvmd_sock = _open_local_sock();
+       if (_clvmd_sock == -1)
+               stack;
+}
+
#ifdef CLUSTER_LOCKING_INTERNAL
/* Built-in initialiser: fill in the locking_type vtable and open the
   connection to the local clvmd.  Returns 1 on success, 0 when clvmd
   cannot be contacted. */
int init_cluster_locking(struct locking_type *locking, struct config_tree *cft)
{
	locking->lock_resource = _lock_resource;
	locking->fin_locking = _locking_end;
	locking->reset_locking = _reset_locking;
	/* Ask LVM to lock memory before calling us */
	locking->flags = LCK_PRE_MEMLOCK;

	_clvmd_sock = _open_local_sock();
	if (_clvmd_sock == -1)
		return 0;

	return 1;
}
#else
/* Shared-library initialiser called through LVM's external locking
   interface.  Opens the clvmd connection and requests pre-memlock.
   Returns 1 on success, 0 when clvmd cannot be contacted. */
int locking_init(int type, struct config_tree *cf, uint32_t *flags)
{
	_clvmd_sock = _open_local_sock();
	if (_clvmd_sock == -1)
		return 0;

	/* Ask LVM to lock memory before calling us */
	*flags |= LCK_PRE_MEMLOCK;

	return 1;
}
#endif
index f4fa45eb3d7d24b3d653e5cd08bf36b1195b44a6..2c5ab1b9e28b84695ce14fd465fcc1d1aebefbf7 100644 (file)
@@ -145,6 +145,14 @@ int init_locking(int type, struct config_tree *cft)
                return 1;
 #endif
 
+#ifdef CLUSTER_LOCKING_INTERNAL
+       case 3:
+               if (!init_cluster_locking(&_locking, cft))
+                       break;
+               log_very_verbose("Cluster locking enabled.");
+               return 1;
+#endif
+
        default:
                log_error("Unknown locking type requested.");
                return 0;
index de8d944908e3b3a1b5a85e54cc3024341f5210c8..441e2c309f65c16abe74e2102789dbaff66b32d4 100644 (file)
@@ -40,3 +40,4 @@ int init_no_locking(struct locking_type *locking, struct config_tree *cf);
 int init_file_locking(struct locking_type *locking, struct config_tree *cf);
 
 int init_external_locking(struct locking_type *locking, struct config_tree *cf);
+int init_cluster_locking(struct locking_type *locking, struct config_tree *cf);
diff --git a/scripts/clvmd_fix_conf.sh b/scripts/clvmd_fix_conf.sh
new file mode 100644 (file)
index 0000000..9e363d5
--- /dev/null
@@ -0,0 +1,154 @@
+#!/bin/sh
+#
+# Edit an lvm.conf file to enable cluster locking.
+#
+# $1 is the directory where the locking library is installed.
+# $2 (optional) is the config file
+# $3 (optional) is the locking library name
+#
+#
+PREFIX=$1
+LVMCONF=$2
+LIB=$3
+
+if [ -z "$PREFIX" ]
+then
+  echo "usage: $0 <prefix> [<config file>] [<library>]"
+  echo ""
+  echo "<prefix>      location of the cluster locking shared library. (no default)"
+  echo "<config file> name of the LVM config file (default: /etc/lvm/lvm.conf)"
+  echo "<library>     name of the shared library (default: liblvm2clusterlock.so)"
+  echo ""
+  exit 0
+fi
+
+[ -z "$LVMCONF" ] && LVMCONF="/etc/lvm/lvm.conf"
+[ -z "$LIB" ] && LIB="liblvm2clusterlock.so"
+
+if [ "${PREFIX:0:1}" != "/" ] # NOTE(review): ${VAR:0:1} is a bashism but the shebang is /bin/sh -- confirm target shells
+then
+  echo "Prefix must be an absolute path name (starting with a /)"
+  exit 12
+fi
+
+if [ ! -f "$LVMCONF" ]
+then
+  echo "$LVMCONF does not exist"
+  exit 10
+fi
+
+if [ ! -f "$PREFIX/$LIB" ]
+then
+  echo "$PREFIX/$LIB does not exist, did you do a \"make install\" ?"
+  exit 11
+fi
+
+
+SCRIPTFILE=`mktemp -t lvmscript.XXXXXXXXXX` # holds the generated sed script
+TMPFILE=`mktemp -t lvmtmp.XXXXXXXXXX` # holds the edited copy of the config file
+
+
+# Flags so we know which parts of the file we can replace and which need
+# adding. These are return codes from grep, so zero means it IS present!
+have_type=1
+have_dir=1
+have_library=1
+have_global=1
+
+grep -q '^[[:blank:]]*locking_type[[:blank:]]*=' $LVMCONF
+have_type=$?
+
+grep -q '^[[:blank:]]*library_dir[[:blank:]]*=' $LVMCONF
+have_dir=$?
+
+grep -q '^[[:blank:]]*locking_library[[:blank:]]*=' $LVMCONF
+have_library=$?
+
+# These options live in the "global {" section, so one must exist if any of them are present.
+if [ "$have_type" = "0" -o "$have_dir" = "0" -o "$have_library" = "0" ]
+then
+
+    # See if we can find it...
+    grep -q '^[[:blank:]]*global[[:blank:]]*{' $LVMCONF
+    have_global=$?
+    
+    if [ "$have_global" = "1" ] # keys present but no enclosing section: refuse to guess
+       then
+       echo "global keys but no 'global {' found, can't edit file"
+       exit 12
+    fi
+fi
+
+# So if we don't have "global {" we need to create one and 
+# populate it
+
+if [ "$have_global" = "1" ]
+then
+    cat $LVMCONF - <<EOF > $TMPFILE
+global {
+    # Enable locking for cluster LVM
+    locking_type = 2
+    library_dir = "$PREFIX"
+    locking_library = "$LIB"
+}
+EOF
+    if [ $? != 0 ]
+    then
+       echo "failed to create temporary config file, $LVMCONF not updated"
+       exit 1
+    fi
+else
+    #
+    # We have a "global {" section, so add or replace the
+    # locking entries as appropriate
+    #
+
+    if [ "$have_type" = "0" ] # key exists: replace it in place
+    then
+       SEDCMD=" s/^[[:blank:]]*locking_type[[:blank:]]*=.*/\ \ \ \ locking_type = 2/g"
+    else
+       SEDCMD=" /global[[:blank:]]*{/a\ \ \ \ locking_type = 2"
+    fi
+    
+    if [ "$have_dir" = "0" ] # key exists: replace it in place (sed delimiter here is ')
+    then
+       SEDCMD="${SEDCMD}\ns'^[[:blank:]]*library_dir[[:blank:]]*=.*'\ \ \ \ library_dir = \"$PREFIX\"'g"
+    else
+       SEDCMD="${SEDCMD}\n/global[[:blank:]]*{/a\ \ \ \ library_dir = \"$PREFIX\""
+    fi
+
+    if [ "$have_library" = "0" ] # key exists: replace it in place
+    then
+       SEDCMD="${SEDCMD}\ns/^[[:blank:]]*locking_library[[:blank:]]*=.*/\ \ \ \ locking_library = \"$LIB\"/g"
+    else
+       SEDCMD="${SEDCMD}\n/global[[:blank:]]*{/a\ \ \ \ locking_library = \"$LIB\""
+    fi
+
+    echo -e $SEDCMD > $SCRIPTFILE # -e turns the \n separators into real newlines between sed commands
+    sed  <$LVMCONF >$TMPFILE -f $SCRIPTFILE
+    if [ $? != 0 ]
+    then
+       echo "sed failed, $LVMCONF not updated"
+       exit 1
+    fi
+fi
+
+# Now we have a suitably edited config file in a temp place,
+# backup the original and copy our new one into place.
+
+cp $LVMCONF $LVMCONF.nocluster # keep a backup of the pre-cluster config
+if [ $? != 0 ]
+    then
+    echo "failed to backup old config file, $LVMCONF not updated"
+    exit 2
+fi
+
+cp $TMPFILE $LVMCONF
+if [ $? != 0 ]
+    then
+    echo "failed to copy new config file into place, check $LVMCONF is still OK"
+    exit 3
+fi
+
+rm -f $SCRIPTFILE $TMPFILE
+
diff --git a/scripts/clvmd_init b/scripts/clvmd_init
new file mode 100755 (executable)
index 0000000..31eb8ce
--- /dev/null
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+#      /etc/rc.d/init.d/clvmd
+#
+# Starts the clvm daemon
+# NOTE: These startup levels may not be right yet - it depends on where
+#       the rest of the cluster startup goes.
+#
+# chkconfig: 345 72 5
+# description: distributes LVM commands in a clustered environment. \
+#    a clvmd must be run on all nodes in a cluster for clustered LVM \
+#    operations to work.
+# processname: clvmd
+
+# Source the init function library (provides daemon, killproc, status).
+. /etc/init.d/functions
+
+BINARY=/usr/sbin/clvmd
+LOCKFILE=/var/lock/subsys/clvmd
+
+test -x "$BINARY" || exit 0 # nothing to do if clvmd isn't installed
+
+RETVAL=0
+
+#
+#      See how we were called.
+#
+
+prog="clvmd"
+
+start() {
+       # Only start if the subsys lockfile is absent, i.e. clvmd is not already running
+       if [ ! -f "$LOCKFILE" ]; then
+           echo -n $"Starting $prog: "
+           daemon $BINARY
+           RETVAL=$?
+           [ $RETVAL -eq 0 ] && touch $LOCKFILE # mark the service as running
+           echo
+       fi
+       return $RETVAL
+}
+
+stop() {
+       echo -n $"Stopping $prog: "
+       killproc $BINARY
+       RETVAL=$?
+       [ $RETVAL -eq 0 ] && rm -f $LOCKFILE # clear the running marker
+       echo
+        return $RETVAL
+}
+
+
+restart() {
+       stop
+       start
+}      
+
+reload() {
+       restart # reload is implemented as a full restart
+}      
+
+status_clvm() {
+       status $BINARY
+}
+
+case "$1" in
+start)
+       start
+       ;;
+stop)
+       stop
+       ;;
+reload|restart)
+       restart
+       ;;
+condrestart)
+       if [ -f $LOCKFILE ]; then # only restart if currently marked running
+           restart
+       fi
+       ;;
+status)
+       status_clvm
+       ;;
+*)
+       echo $"Usage: $0 {start|stop|restart|condrestart|status}"
+       exit 1
+esac
+
+exit $? # exits with the status of the handled action
+exit $RETVAL # NOTE(review): unreachable -- the exit above always runs first
This page took 0.178259 seconds and 5 git commands to generate.