cluster: STABLE3 - qdisk: Add reporting for I/O hangs to quorum disk

Lon Hohberger lon@fedoraproject.org
Thu May 14 13:38:00 GMT 2009


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=6c4dea2b599fc9f461a7e1063f36a772c8e7d15f
Commit:        6c4dea2b599fc9f461a7e1063f36a772c8e7d15f
Parent:        f0918fef3046b9362cfe9349a4c6589d1b96e3d0
Author:        Lon Hohberger <lhh@redhat.com>
AuthorDate:    Fri May 8 13:23:04 2009 -0400
Committer:     Lon Hohberger <lhh@redhat.com>
CommitterDate: Thu May 14 09:37:52 2009 -0400

qdisk: Add reporting for I/O hangs to quorum disk

Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 cman/qdisk/Makefile  |    5 +-
 cman/qdisk/disk.c    |   21 +++++++-
 cman/qdisk/iostate.c |  142 ++++++++++++++++++++++++++++++++++++++++++++++++++
 cman/qdisk/iostate.h |   17 ++++++
 cman/qdisk/main.c    |    7 +++
 5 files changed, 188 insertions(+), 4 deletions(-)

diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index 0b3629d..68e20cd 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -18,7 +18,7 @@ CFLAGS += -I$(S)
 CFLAGS += -I${incdir}
 
 LDFLAGS += -L${logtlibdir} -llogthread -lpthread
-LDFLAGS += -L${zliblibdir} -lz
+LDFLAGS += -L${zliblibdir} -lz -lrt
 LDFLAGS += -L${libdir}
 
 EXTRA_LDFLAGS += -L${cmanlibdir} -L${ccslibdir} -lcman -lccs
@@ -33,7 +33,8 @@ OBJS2=	mkqdisk.o
 SHAREDOBJS= disk.o \
 	    disk_util.o \
 	    proc.o \
-	    scandisk.o
+	    scandisk.o \
+	    iostate.o
 
 ${TARGET1}: ${SHAREDOBJS} ${OBJS1}
 	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
diff --git a/cman/qdisk/disk.c b/cman/qdisk/disk.c
index e349698..680da2f 100644
--- a/cman/qdisk/disk.c
+++ b/cman/qdisk/disk.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <liblogthread.h>
 #include <zlib.h>
+#include "iostate.h"
 
 static int diskRawRead(target_info_t *disk, char *buf, int len);
 
@@ -229,7 +230,9 @@ qdisk_open(char *name, target_info_t *disk)
 	disk->d_pagesz = sysconf(_SC_PAGESIZE);
 
 	/* Check to verify that the partition is large enough.*/
+	io_state(STATE_LSEEK);
 	ret = lseek(disk->d_fd, END_OF_DISK(disk->d_blksz), SEEK_SET);
+	io_state(STATE_NONE);
 	if (ret < 0) {
 		logt_print(LOG_DEBUG, "open_partition: seek");
 		close(disk->d_fd);
@@ -332,7 +335,9 @@ diskRawReadShadow(target_info_t *disk, off_t readOffset, char *buf, int len)
 	shared_header_t *hdrp;
 	char *data;
 
+	io_state(STATE_LSEEK);
 	ret = lseek(disk->d_fd, readOffset, SEEK_SET);
+	io_state(STATE_NONE);
 	if (ret != readOffset) {
 		logt_print(LOG_DEBUG,
 		       "diskRawReadShadow: can't seek to offset %d.\n",
@@ -391,7 +396,10 @@ diskRawRead(target_info_t *disk, char *buf, int len)
 	if (bounceNeeded == 0) {
 		/* Already aligned and even multiple of 512, no bounceio
 		 * required. */
-		return (read(disk->d_fd, buf, len));
+		io_state(STATE_READ);
+		readret = read(disk->d_fd, buf, len);
+		io_state(STATE_NONE);
+		return readret;
 	}
 
 	if (len > disk->d_blksz) {
@@ -420,7 +428,9 @@ diskRawRead(target_info_t *disk, char *buf, int len)
 		return -1;
 	}
 
+	io_state(STATE_READ);
 	readret = read(disk->d_fd, alignedBuf, readlen);
+	io_state(STATE_NONE);
 	if (readret > 0) {
 		if (readret > len) {
 			memcpy(alignedBuf, buf, len);
@@ -463,7 +473,10 @@ diskRawWrite(target_info_t *disk, char *buf, int len)
 	if (bounceNeeded == 0) {
 		/* Already aligned and even multiple of 512, no bounceio
 		 * required. */
-		return (write(disk->d_fd, buf, len));
+		io_state(STATE_WRITE);
+		ret = write(disk->d_fd, buf, len);
+		io_state(STATE_NONE);
+		return ret;
 	}
 
 	if (len > disk->d_blksz) {
@@ -500,7 +513,9 @@ diskRawWrite(target_info_t *disk, char *buf, int len)
 	}
 
 	memcpy(buf, alignedBuf, len);
+	io_state(STATE_WRITE);
 	ret = write(disk->d_fd, alignedBuf, writelen);
+	io_state(STATE_NONE);
 	if (ret > len) {
 		ret = len;
 	}
@@ -528,7 +543,9 @@ diskRawWriteShadow(target_info_t *disk, __off64_t writeOffset, char *buf, int le
 		return (-1);
 	}
 
+	io_state(STATE_LSEEK);
 	retval_seek = lseek(disk->d_fd, writeOffset, SEEK_SET);
+	io_state(STATE_NONE);
 	if (retval_seek != writeOffset) {
 		logt_print(LOG_ERR,
 		       "diskRawWriteShadow: can't seek to offset %d\n",
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
new file mode 100644
index 0000000..f195c45
--- /dev/null
+++ b/cman/qdisk/iostate.c
@@ -0,0 +1,142 @@
+#include <pthread.h>
+#include <iostate.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/time.h>
+#include <liblogthread.h>
+#include "iostate.h"
+
+static iostate_t main_state = 0;	/* state most recently recorded by io_state() */
+static int main_incarnation = 0;	/* incremented on every io_state() call */
+static int qdisk_timeout = 0, sleeptime = 0;	/* set by io_nanny_start(): timeout and timeout / 2 */
+static int thread_active = 0;	/* nonzero while io_nanny_thread() should keep looping */
+static pthread_t io_nanny_tid = 0;	/* thread created by io_nanny_start() */
+static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;	/* held while main_state / main_incarnation are read or written */
+static pthread_cond_t state_cond = PTHREAD_COND_INITIALIZER;	/* waited on by io_nanny_thread(); broadcast by io_state() and io_nanny_stop() */
+
+struct state_table {	/* one state-to-name mapping */
+	iostate_t state;	/* state value to match */
+	const char *value;	/* name used for that state; NULL marks the end of the table */
+};
+
+static struct state_table io_state_table[] = {	/* searched by state_to_string() */
+{	STATE_NONE,	"none"	},
+{	STATE_WRITE,	"write"	},
+{	STATE_READ,	"read"	},
+{	STATE_LSEEK,	"seek"	},
+{	-1,		NULL	} };	/* terminator: the lookup loop stops at the NULL value */
+
+/*
+ * Translate an I/O state into its name from io_state_table.
+ *
+ * Returns the matching table string, or "unknown" when the state is
+ * not listed.  The result points at a string literal; callers must
+ * not modify or free it.
+ */
+static const char *
+state_to_string(iostate_t state)
+{
+	/* Deliberately not static: a static fallback would retain the
+	 * name found by an earlier call and hand it back for a later
+	 * state that is missing from the table. */
+	const char *ret = "unknown";
+	int i;
+
+	for (i=0; io_state_table[i].value; i++) {
+		if (io_state_table[i].state == state) {
+			ret = io_state_table[i].value;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
+/*
+ * Record the I/O state the caller is entering, or STATE_NONE once the
+ * call has returned.  Every call bumps the incarnation counter; any
+ * state other than STATE_NONE also wakes the waiters on state_cond.
+ */
+void
+io_state(iostate_t state)
+{
+	pthread_mutex_lock(&state_mutex);
+	main_incarnation++; /* it does not matter if this wraps. */
+	main_state = state;
+	pthread_mutex_unlock(&state_mutex);
+
+	/* Optimization: returning to STATE_NONE needs no wakeup */
+	if (state == STATE_NONE)
+		return;
+
+	pthread_cond_broadcast(&state_cond);
+}
+
+
+/*
+ * Watchdog thread body.  Waits on state_cond for up to sleeptime
+ * seconds at a time and logs a warning when the state recorded by
+ * io_state() is still the same non-NONE state and incarnation after a
+ * wait that ended without a wakeup.
+ *
+ * arg is unused.  Always returns NULL.  Loops until thread_active is
+ * cleared.
+ */
+static void *
+io_nanny_thread(void *arg)
+{
+	struct timespec wait_time;
+	iostate_t last_main_state = 0, current_main_state = 0;
+	int last_main_incarnation = 0, current_main_incarnation = 0;
+	int logged_incarnation = 0;
+	int wait_ret;
+
+	/* Start with wherever we're at now */
+	pthread_mutex_lock(&state_mutex);
+	current_main_state = last_main_state = main_state;
+	current_main_incarnation = last_main_incarnation = main_incarnation;
+	pthread_mutex_unlock(&state_mutex);
+
+	while (thread_active) {
+		pthread_mutex_lock(&state_mutex);
+		clock_gettime(CLOCK_REALTIME, &wait_time);
+		wait_time.tv_sec += sleeptime;
+		wait_ret = pthread_cond_timedwait(&state_cond, &state_mutex,
+						  &wait_time);
+		current_main_state = main_state;
+		current_main_incarnation = main_incarnation;
+		pthread_mutex_unlock(&state_mutex);
+
+		if (!thread_active)
+			break;
+
+		/* No I/O call in progress: nothing can be hung */
+		if (!current_main_state)
+			continue;
+
+		/* if the state or incarnation changed, the main qdiskd
+		 * thread is healthy */
+		if (current_main_state != last_main_state ||
+		    current_main_incarnation != last_main_incarnation) {
+			last_main_state = current_main_state;
+			last_main_incarnation = current_main_incarnation;
+			continue;
+		}
+
+		/* A zero return means the wait ended before its deadline.
+		 * With the state unchanged that is a spurious wakeup, not
+		 * sleeptime seconds without progress, so wait again
+		 * instead of reporting a hang. */
+		if (wait_ret == 0)
+			continue;
+
+		/* Don't log things twice */
+		if (logged_incarnation == current_main_incarnation)
+			continue;
+		logged_incarnation = current_main_incarnation;
+
+		logt_print(LOG_WARNING, "qdiskd: %s "
+			   "(system call) has hung for %d seconds\n",
+			   state_to_string(current_main_state), sleeptime);
+		logt_print(LOG_WARNING,
+			   "In %d more seconds, we will be evicted\n",
+			   (qdisk_timeout-sleeptime));
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Start the I/O nanny thread.
+ *
+ * timeout is the eviction timeout in seconds; the nanny waits half of
+ * it (at least one second) before reporting an I/O call as hung.
+ *
+ * Returns 0 on success, or the error number from pthread_create(), in
+ * which case no thread is left marked as active.
+ */
+int
+io_nanny_start(int timeout)
+{
+	int ret;
+
+	pthread_mutex_lock(&state_mutex);
+
+	sleeptime = timeout / 2;
+	/* A zero-second wait would expire immediately and make the nanny
+	 * thread spin, so never wait less than one second. */
+	if (sleeptime < 1)
+		sleeptime = 1;
+	qdisk_timeout = timeout;
+	thread_active = 1;
+
+	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+	if (ret != 0)
+		thread_active = 0;	/* no thread exists to stop or join */
+	pthread_mutex_unlock(&state_mutex);
+
+	return ret;
+}
+
+
+/*
+ * Stop the I/O nanny thread and wait for it to exit.
+ *
+ * Does nothing when no nanny thread is marked active.  Always
+ * returns 0.
+ */
+int
+io_nanny_stop(void)
+{
+	pthread_mutex_lock(&state_mutex);
+	if (!thread_active) {
+		/* Nothing was started (or it was already stopped), so
+		 * there is no valid thread id to join. */
+		pthread_mutex_unlock(&state_mutex);
+		return 0;
+	}
+
+	/* Clear the flag and signal under the mutex the nanny holds
+	 * around its wait, so a thread already waiting sees both. */
+	thread_active = 0;
+	pthread_cond_broadcast(&state_cond);
+	pthread_mutex_unlock(&state_mutex);
+
+	/* Join only after unlocking: the nanny must re-acquire the mutex
+	 * to return from its wait. */
+	pthread_join(io_nanny_tid, NULL);
+	io_nanny_tid = 0;
+
+	return 0;
+}
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
new file mode 100644
index 0000000..7dd7bf6
--- /dev/null
+++ b/cman/qdisk/iostate.h
@@ -0,0 +1,17 @@
+#ifndef _IOSTATE_H
+#define _IOSTATE_H
+
+typedef enum {	/* I/O call currently in progress, as reported through io_state() */
+	STATE_NONE	= 0,	/* no call in progress; set after each wrapped call returns */
+	STATE_READ	= 1,	/* set around read() */
+	STATE_WRITE	= 2,	/* set around write() */
+	STATE_LSEEK	= 3,	/* set around lseek() */
+	STATE_UNKNOWN	= 4	/* NOTE(review): not referenced anywhere in this patch - confirm intended use */
+} iostate_t;
+
+void io_state(iostate_t state);
+
+int io_nanny_start(int timeout);
+int io_nanny_stop(void);
+
+#endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index d1865cd..a6be5a8 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -28,6 +28,8 @@
 
 #define LOG_DAEMON_NAME  "qdiskd"
 #define LOG_MODE_DEFAULT LOG_MODE_OUTPUT_SYSLOG|LOG_MODE_OUTPUT_FILE
+#include "iostate.h"
+
 
 /* from main.c */
 void set_priority(int queue, int prio);
@@ -1793,9 +1795,14 @@ main(int argc, char **argv)
 		goto out;
 	}
 	*/
+
+	io_nanny_start(ctx.qc_tko * ctx.qc_interval);
+
 	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
 		cman_unregister_quorum_device(ctx.qc_cman_admin);
 
+	io_nanny_stop();
+
 	quorum_logout(&ctx);
 out:
 	/* free cman handle to avoid leak in cman */



More information about the Cluster-cvs mailing list