cluster: RHEL5 - qdisk: Add reporting for I/O hangs to quorum disk

Lon Hohberger lon@fedoraproject.org
Wed May 13 15:14:00 GMT 2009


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=83a61282601bff7dd26e8bcf4ebd4b1f38d6e25c
Commit:        83a61282601bff7dd26e8bcf4ebd4b1f38d6e25c
Parent:        bb1e50295e8dcf36d7ce9ad22196fd7f89fba899
Author:        Lon Hohberger <lhh@redhat.com>
AuthorDate:    Fri May 8 13:23:04 2009 -0400
Committer:     Lon Hohberger <lhh@redhat.com>
CommitterDate: Wed May 13 11:13:56 2009 -0400

qdisk: Add reporting for I/O hangs to quorum disk

Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 cman/qdisk/Makefile  |   10 ++--
 cman/qdisk/disk.c    |   21 +++++++-
 cman/qdisk/iostate.c |  142 ++++++++++++++++++++++++++++++++++++++++++++++++++
 cman/qdisk/iostate.h |   17 ++++++
 cman/qdisk/main.c    |    6 ++
 5 files changed, 189 insertions(+), 7 deletions(-)

diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index 23d0890..f58806b 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -28,12 +28,12 @@ install: all
 	install ${TARGET} ${sbindir}
 
 qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
-	gettid.o proc.o daemon_init.o scandisk.o ../lib/libcman.a
-	gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs
+	gettid.o proc.o daemon_init.o scandisk.o iostate.o ../lib/libcman.a
+	gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs -lrt
 
-mkqdisk: disk.o crc32.o disk_util.o \
-	 proc.o mkqdisk.o scandisk.o
-	gcc -o $@ $^ 
+mkqdisk: disk.o crc32.o disk_util.o iostate.o \
+	 proc.o mkqdisk.o scandisk.o clulog.o gettid.o
+	gcc -o $@ $^ -lrt
 
 %.o: %.c
 	$(CC) -c -o $@ $^ $(INCLUDES) $(CFLAGS)
diff --git a/cman/qdisk/disk.c b/cman/qdisk/disk.c
index 8cf7b5a..6771e06 100644
--- a/cman/qdisk/disk.c
+++ b/cman/qdisk/disk.c
@@ -44,6 +44,7 @@
 #include <unistd.h>
 #include <time.h>
 #include <linux/fs.h>
+#include "iostate.h"
 
 static int diskRawRead(target_info_t *disk, char *buf, int len);
 uint32_t clu_crc32(const char *data, size_t count);
@@ -236,7 +237,9 @@ qdisk_open(char *name, target_info_t *disk)
 	disk->d_pagesz = sysconf(_SC_PAGESIZE);
 
 	/* Check to verify that the partition is large enough.*/
+	io_state(STATE_LSEEK);
 	ret = lseek(disk->d_fd, END_OF_DISK(disk->d_blksz), SEEK_SET);
+	io_state(STATE_NONE);
 	if (ret < 0) {
 		perror("open_partition: seek");
 		close(disk->d_fd);
@@ -340,7 +343,9 @@ diskRawReadShadow(target_info_t *disk, off_t readOffset, char *buf, int len)
 	char *data;
 	int datalen;
 
+	io_state(STATE_LSEEK);
 	ret = lseek(disk->d_fd, readOffset, SEEK_SET);
+	io_state(STATE_NONE);
 	if (ret != readOffset) {
 #if 0
 		fprintf(stderr,
@@ -405,7 +410,10 @@ diskRawRead(target_info_t *disk, char *buf, int len)
 	if (bounceNeeded == 0) {
 		/* Already aligned and even multiple of 512, no bounceio
 		 * required. */
-		return (read(disk->d_fd, buf, len));
+		io_state(STATE_READ);
+		readret = read(disk->d_fd, buf, len);
+		io_state(STATE_NONE);
+		return readret;
 	}
 
 	if (len > disk->d_blksz) {
@@ -434,7 +442,9 @@ diskRawRead(target_info_t *disk, char *buf, int len)
 		return -1;
 	}
 
+	io_state(STATE_READ);
 	readret = read(disk->d_fd, alignedBuf, readlen);
+	io_state(STATE_NONE);
 	if (readret > 0) {
 		if (readret > len) {
 			memcpy(alignedBuf, buf, len);
@@ -477,7 +487,10 @@ diskRawWrite(target_info_t *disk, char *buf, int len)
 	if (bounceNeeded == 0) {
 		/* Already aligned and even multiple of 512, no bounceio
 		 * required. */
-		return (write(disk->d_fd, buf, len));
+		io_state(STATE_WRITE);
+		ret = write(disk->d_fd, buf, len);
+		io_state(STATE_NONE);
+		return ret;
 	}
 
 	if (len > disk->d_blksz) {
@@ -514,7 +527,9 @@ diskRawWrite(target_info_t *disk, char *buf, int len)
 	}
 
 	memcpy(buf, alignedBuf, len);
+	io_state(STATE_WRITE);
 	ret = write(disk->d_fd, alignedBuf, writelen);
+	io_state(STATE_NONE);
 	if (ret > len) {
 		ret = len;
 	}
@@ -542,7 +557,9 @@ diskRawWriteShadow(target_info_t *disk, __off64_t writeOffset, char *buf, int le
 		return (-1);
 	}
 
+	io_state(STATE_LSEEK);
 	retval_seek = lseek(disk->d_fd, writeOffset, SEEK_SET);
+	io_state(STATE_NONE);
 	if (retval_seek != writeOffset) {
 		fprintf(stderr,
 		       "diskRawWriteShadow: can't seek to offset %d\n",
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
new file mode 100644
index 0000000..f4f2329
--- /dev/null
+++ b/cman/qdisk/iostate.c
@@ -0,0 +1,142 @@
+#include <errno.h>
+#include <pthread.h>
+#include <iostate.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/time.h>
+#include <clulog.h>
+#include "iostate.h"
+
+static iostate_t main_state = 0;	/* state most recently passed to io_state() */
+static int main_incarnation = 0;	/* incremented on every io_state() call */
+static int qdisk_timeout = 0, sleeptime = 0;	/* both set by io_nanny_start() from its timeout argument */
+static int thread_active = 0;	/* nanny loop runs while nonzero; cleared by io_nanny_stop() */
+static pthread_t io_nanny_tid = 0;	/* thread created by io_nanny_start(), joined by io_nanny_stop() */
+static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;	/* held while main_state / main_incarnation are read or written */
+static pthread_cond_t state_cond = PTHREAD_COND_INITIALIZER;	/* broadcast by io_state() for non-NONE states and by io_nanny_stop() */
+
+/* One row of the state-to-name lookup table used for log messages. */
+struct state_table {
+	iostate_t state;	/* state value to match */
+	const char *value;	/* printable name; NULL marks the end of the table */
+};
+
+/*
+ * Printable names for the I/O states.  The table is only ever read, so
+ * it is const.  The final row (value == NULL) terminates the lookup
+ * loop and is never matched.
+ */
+static const struct state_table io_state_table[] = {
+{	STATE_NONE,	"none"	},
+{	STATE_WRITE,	"write"	},
+{	STATE_READ,	"read"	},
+{	STATE_LSEEK,	"seek"	},
+{	-1,		NULL	} };
+
+/*
+ * Return the printable name of an I/O state for use in log messages.
+ *
+ * The state is looked up in io_state_table; a state with no entry there
+ * (for example STATE_UNKNOWN) yields "unknown".  The returned pointer
+ * refers to a string literal and must not be modified or freed.
+ */
+static const char *
+state_to_string(iostate_t state)
+{
+	/*
+	 * Deliberately not static: a static result would remember the name
+	 * found by an earlier call and return it for a later state that is
+	 * not in the table.
+	 */
+	const char *ret = "unknown";
+	int i;
+
+	for (i = 0; io_state_table[i].value; i++) {
+		if (io_state_table[i].state == state) {
+			ret = io_state_table[i].value;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
+/*
+ * Record the I/O state of the calling thread.
+ *
+ * Stores the new state and bumps the incarnation counter under
+ * state_mutex, then wakes any waiter on state_cond unless the new state
+ * is STATE_NONE.
+ */
+void
+io_state(iostate_t state)
+{
+	pthread_mutex_lock(&state_mutex);
+	main_state = state;
+	/*
+	 * It does not matter if this wraps, but signed overflow is
+	 * undefined, so do the increment in unsigned arithmetic.
+	 */
+	main_incarnation = (int)((unsigned int)main_incarnation + 1U);
+	pthread_mutex_unlock(&state_mutex);
+
+	/* Optimization: Don't signal on STATE_NONE */
+	if (state != STATE_NONE)
+		pthread_cond_broadcast(&state_cond);
+}
+
+
+/*
+ * Watchdog thread body.
+ *
+ * Repeatedly waits on state_cond for up to sleeptime seconds and then
+ * samples main_state / main_incarnation.  If a full wait interval
+ * elapses while a non-NONE state and its incarnation stay unchanged,
+ * a warning is logged once for that incarnation.  The loop ends when
+ * thread_active becomes zero.
+ *
+ * arg is unused.  Always returns NULL.
+ */
+static void *
+io_nanny_thread(void *arg)
+{
+	struct timespec wait_time;
+	iostate_t last_main_state = 0, current_main_state = 0;
+	int last_main_incarnation = 0, current_main_incarnation = 0;
+	int logged_incarnation = 0;
+	int wait_ret;
+
+	(void)arg;
+
+	/* Start with wherever we're at now */
+	pthread_mutex_lock(&state_mutex);
+	current_main_state = last_main_state = main_state;
+	current_main_incarnation = last_main_incarnation = main_incarnation;
+	pthread_mutex_unlock(&state_mutex);
+
+	while (thread_active) {
+		pthread_mutex_lock(&state_mutex);
+		clock_gettime(CLOCK_REALTIME, &wait_time);
+		wait_time.tv_sec += sleeptime;
+		wait_ret = pthread_cond_timedwait(&state_cond, &state_mutex,
+						  &wait_time);
+		current_main_state = main_state;
+		current_main_incarnation = main_incarnation;
+		pthread_mutex_unlock(&state_mutex);
+
+		if (!thread_active)
+			break;
+
+		/* No I/O call in progress: nothing can be hung */
+		if (!current_main_state)
+			continue;
+
+		/* if the state or incarnation changed, the main qdiskd
+		 * thread is healthy */
+		if (current_main_state != last_main_state ||
+		    current_main_incarnation != last_main_incarnation) {
+			last_main_state = current_main_state;
+			last_main_incarnation = current_main_incarnation;
+			continue;
+		}
+
+		/*
+		 * Only a wait that ran the full interval proves the state
+		 * has been stuck for sleeptime seconds.  A wakeup caused by
+		 * a broadcast that arrived after we sampled the state, or a
+		 * spurious wakeup, must not be reported as a hang.
+		 */
+		if (wait_ret != ETIMEDOUT)
+			continue;
+
+		/* Don't log things twice */
+		if (logged_incarnation == current_main_incarnation)
+			continue;
+		logged_incarnation = current_main_incarnation;
+
+		clulog(LOG_WARNING, "qdiskd: %s "
+			   "(system call) has hung for %d seconds\n",
+			   state_to_string(current_main_state), sleeptime);
+		clulog(LOG_WARNING,
+			   "In %d more seconds, we will be evicted\n",
+			   (qdisk_timeout-sleeptime));
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Start the I/O watchdog thread.
+ *
+ * timeout is stored as qdisk_timeout; the watchdog's wait interval
+ * (sleeptime) is half of it, but never less than one second.
+ *
+ * Returns 0 on success, or the error number returned by
+ * pthread_create().  On failure the watchdog is left marked inactive.
+ */
+int
+io_nanny_start(int timeout)
+{
+	int ret;
+
+	pthread_mutex_lock(&state_mutex);
+
+	sleeptime = timeout / 2;
+	/*
+	 * A zero or negative interval would give the timed wait a deadline
+	 * that has already passed, turning the watchdog into a busy loop.
+	 */
+	if (sleeptime < 1)
+		sleeptime = 1;
+	qdisk_timeout = timeout;
+	thread_active = 1;
+
+	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+	if (ret != 0) {
+		/* No thread exists: do not leave state claiming one does */
+		thread_active = 0;
+		io_nanny_tid = 0;
+	}
+	pthread_mutex_unlock(&state_mutex);
+
+	return ret;
+}
+
+
+/*
+ * Stop the I/O watchdog thread and wait for it to exit.
+ *
+ * Clears thread_active, wakes the watchdog and joins it.  Calling this
+ * when no watchdog thread is recorded (io_nanny_tid is 0, e.g. after a
+ * previous stop) is a no-op.  Always returns 0.
+ */
+int
+io_nanny_stop(void)
+{
+	pthread_mutex_lock(&state_mutex);
+	thread_active = 0;
+	pthread_mutex_unlock(&state_mutex);
+
+	/* Nothing to join; pthread_join() on a zero id is not valid */
+	if (io_nanny_tid == 0)
+		return 0;
+
+	pthread_cond_broadcast(&state_cond);
+	pthread_join(io_nanny_tid, NULL);
+	io_nanny_tid = 0;
+
+	return 0;
+}
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
new file mode 100644
index 0000000..7dd7bf6
--- /dev/null
+++ b/cman/qdisk/iostate.h
@@ -0,0 +1,17 @@
+#ifndef _IOSTATE_H
+#define _IOSTATE_H
+
+typedef enum {	/* I/O state reported through io_state() */
+	STATE_NONE	= 0,	/* no tracked call in progress */
+	STATE_READ	= 1,	/* set around read() in disk.c */
+	STATE_WRITE	= 2,	/* set around write() in disk.c */
+	STATE_LSEEK	= 3,	/* set around lseek() in disk.c */
+	STATE_UNKNOWN	= 4	/* NOTE(review): not referenced in the visible code - confirm intended use */
+} iostate_t;
+
+void io_state(iostate_t state);	/* record the current I/O state; pass STATE_NONE once the call returns */
+
+int io_nanny_start(int timeout);	/* start the watchdog thread; returns the pthread_create() result */
+int io_nanny_stop(void);	/* stop and join the watchdog thread; returns 0 */
+
+#endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index e235883..090c71e 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -43,6 +43,7 @@
 #include <ccs.h>
 #include "score.h"
 #include "clulog.h"
+#include "iostate.h"
 #if (!defined(LIBCMAN_VERSION) || \
      (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2))
 #include <cluster/cnxman-socket.h>
@@ -1592,9 +1593,14 @@ main(int argc, char **argv)
 		goto out;
 	}
 	*/
+
+	io_nanny_start(ctx.qc_tko * ctx.qc_interval);
+
 	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
 		cman_unregister_quorum_device(ctx.qc_ch);
 
+	io_nanny_stop();
+
 	quorum_logout(&ctx);
 	/* free cman handle to avoid leak in cman */
 out:



More information about the Cluster-cvs mailing list