cluster: STABLE3 - qdisk: Add reporting for I/O hangs to quorum disk
Lon Hohberger
lon@fedoraproject.org
Thu May 14 13:38:00 GMT 2009
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=6c4dea2b599fc9f461a7e1063f36a772c8e7d15f
Commit: 6c4dea2b599fc9f461a7e1063f36a772c8e7d15f
Parent: f0918fef3046b9362cfe9349a4c6589d1b96e3d0
Author: Lon Hohberger <lhh@redhat.com>
AuthorDate: Fri May 8 13:23:04 2009 -0400
Committer: Lon Hohberger <lhh@redhat.com>
CommitterDate: Thu May 14 09:37:52 2009 -0400
qdisk: Add reporting for I/O hangs to quorum disk
Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
cman/qdisk/Makefile | 5 +-
cman/qdisk/disk.c | 21 +++++++-
cman/qdisk/iostate.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++
cman/qdisk/iostate.h | 17 ++++++
cman/qdisk/main.c | 7 +++
5 files changed, 188 insertions(+), 4 deletions(-)
diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index 0b3629d..68e20cd 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -18,7 +18,7 @@ CFLAGS += -I$(S)
CFLAGS += -I${incdir}
LDFLAGS += -L${logtlibdir} -llogthread -lpthread
-LDFLAGS += -L${zliblibdir} -lz
+LDFLAGS += -L${zliblibdir} -lz -lrt
LDFLAGS += -L${libdir}
EXTRA_LDFLAGS += -L${cmanlibdir} -L${ccslibdir} -lcman -lccs
@@ -33,7 +33,8 @@ OBJS2= mkqdisk.o
SHAREDOBJS= disk.o \
disk_util.o \
proc.o \
- scandisk.o
+ scandisk.o \
+ iostate.o
${TARGET1}: ${SHAREDOBJS} ${OBJS1}
$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
diff --git a/cman/qdisk/disk.c b/cman/qdisk/disk.c
index e349698..680da2f 100644
--- a/cman/qdisk/disk.c
+++ b/cman/qdisk/disk.c
@@ -27,6 +27,7 @@
#include <linux/fs.h>
#include <liblogthread.h>
#include <zlib.h>
+#include "iostate.h"
static int diskRawRead(target_info_t *disk, char *buf, int len);
@@ -229,7 +230,9 @@ qdisk_open(char *name, target_info_t *disk)
disk->d_pagesz = sysconf(_SC_PAGESIZE);
/* Check to verify that the partition is large enough.*/
+ io_state(STATE_LSEEK);
ret = lseek(disk->d_fd, END_OF_DISK(disk->d_blksz), SEEK_SET);
+ io_state(STATE_NONE);
if (ret < 0) {
logt_print(LOG_DEBUG, "open_partition: seek");
close(disk->d_fd);
@@ -332,7 +335,9 @@ diskRawReadShadow(target_info_t *disk, off_t readOffset, char *buf, int len)
shared_header_t *hdrp;
char *data;
+ io_state(STATE_LSEEK);
ret = lseek(disk->d_fd, readOffset, SEEK_SET);
+ io_state(STATE_NONE);
if (ret != readOffset) {
logt_print(LOG_DEBUG,
"diskRawReadShadow: can't seek to offset %d.\n",
@@ -391,7 +396,10 @@ diskRawRead(target_info_t *disk, char *buf, int len)
if (bounceNeeded == 0) {
/* Already aligned and even multiple of 512, no bounceio
* required. */
- return (read(disk->d_fd, buf, len));
+ io_state(STATE_READ);
+ readret = read(disk->d_fd, buf, len);
+ io_state(STATE_NONE);
+ return readret;
}
if (len > disk->d_blksz) {
@@ -420,7 +428,9 @@ diskRawRead(target_info_t *disk, char *buf, int len)
return -1;
}
+ io_state(STATE_READ);
readret = read(disk->d_fd, alignedBuf, readlen);
+ io_state(STATE_NONE);
if (readret > 0) {
if (readret > len) {
memcpy(alignedBuf, buf, len);
@@ -463,7 +473,10 @@ diskRawWrite(target_info_t *disk, char *buf, int len)
if (bounceNeeded == 0) {
/* Already aligned and even multiple of 512, no bounceio
* required. */
- return (write(disk->d_fd, buf, len));
+ io_state(STATE_WRITE);
+ ret = write(disk->d_fd, buf, len);
+ io_state(STATE_NONE);
+ return ret;
}
if (len > disk->d_blksz) {
@@ -500,7 +513,9 @@ diskRawWrite(target_info_t *disk, char *buf, int len)
}
memcpy(buf, alignedBuf, len);
+ io_state(STATE_WRITE);
ret = write(disk->d_fd, alignedBuf, writelen);
+ io_state(STATE_NONE);
if (ret > len) {
ret = len;
}
@@ -528,7 +543,9 @@ diskRawWriteShadow(target_info_t *disk, __off64_t writeOffset, char *buf, int le
return (-1);
}
+ io_state(STATE_LSEEK);
retval_seek = lseek(disk->d_fd, writeOffset, SEEK_SET);
+ io_state(STATE_NONE);
if (retval_seek != writeOffset) {
logt_print(LOG_ERR,
"diskRawWriteShadow: can't seek to offset %d\n",
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
new file mode 100644
index 0000000..f195c45
--- /dev/null
+++ b/cman/qdisk/iostate.c
@@ -0,0 +1,142 @@
+#include <pthread.h>
+#include <iostate.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/time.h>
+#include <liblogthread.h>
+#include "iostate.h"
+
+/* Most recent I/O state published through io_state(). */
+static iostate_t main_state = 0;
+/* Bumped on every io_state() call so the nanny can tell two consecutive
+ * calls with the same state apart from one call that never returned. */
+static int main_incarnation = 0;
+/* Both are set by io_nanny_start(): qdisk_timeout is the timeout it was
+ * given, sleeptime is half of that (integer division) and is the number of
+ * seconds the nanny thread waits between checks. */
+static int qdisk_timeout = 0, sleeptime = 0;
+/* Nonzero while the nanny thread is supposed to keep running;
+ * io_nanny_start() sets it and io_nanny_stop() clears it. */
+static int thread_active = 0;
+/* Thread id filled in by pthread_create() in io_nanny_start(); reset to 0
+ * by io_nanny_stop() after the join. */
+static pthread_t io_nanny_tid = 0;
+/* state_mutex is held around every update of main_state/main_incarnation
+ * and around the nanny's reads of them. */
+static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
+/* Broadcast by io_state() for non-NONE states and by io_nanny_stop(); the
+ * nanny thread waits on it with a timeout. */
+static pthread_cond_t state_cond = PTHREAD_COND_INITIALIZER;
+
+/* One row of the state-to-name lookup used by state_to_string(). */
+struct state_table {
+	iostate_t state;	/* I/O state identifier */
+	const char *value;	/* name used in log messages; NULL ends the table */
+};
+
+/*
+ * Names for the states that can be reported.  The table is scanned
+ * linearly and is terminated by the entry whose value is NULL.
+ */
+static const struct state_table io_state_table[] = {
+	{ .state = STATE_NONE,  .value = "none"  },
+	{ .state = STATE_WRITE, .value = "write" },
+	{ .state = STATE_READ,  .value = "read"  },
+	{ .state = STATE_LSEEK, .value = "seek"  },
+	{ .state = -1,          .value = NULL    }
+};
+
+/*
+ * Translate an I/O state into the name listed for it in io_state_table.
+ *
+ * Returns the table's string for 'state', or "unknown" when the state has
+ * no entry.  The returned pointer refers to a string literal and must not
+ * be freed or modified.
+ */
+static const char *
+state_to_string(iostate_t state)
+{
+	/* Must be an automatic variable: a static one would keep the name
+	 * found by an earlier call and return it for unmapped states. */
+	const char *ret = "unknown";
+	int i;
+
+	for (i = 0; io_state_table[i].value; i++) {
+		if (io_state_table[i].state == state) {
+			ret = io_state_table[i].value;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
+/*
+ * Publish the I/O state the caller is entering.
+ *
+ * Stores 'state' in main_state and bumps main_incarnation under
+ * state_mutex, then broadcasts state_cond unless the new state is
+ * STATE_NONE.
+ */
+void
+io_state(iostate_t state)
+{
+	/* Leaving an operation does not need to wake the nanny */
+	int notify = (state != STATE_NONE);
+
+	pthread_mutex_lock(&state_mutex);
+	main_incarnation++;	/* it does not matter if this wraps. */
+	main_state = state;
+	pthread_mutex_unlock(&state_mutex);
+
+	if (notify)
+		pthread_cond_broadcast(&state_cond);
+}
+
+
+/*
+ * Watchdog thread body: warns when the state published through io_state()
+ * stays on the same non-NONE state and incarnation across a full
+ * 'sleeptime'-second wait.
+ *
+ * arg is unused.  Always returns NULL; the loop ends once thread_active
+ * is cleared.
+ */
+static void *
+io_nanny_thread(void *arg)
+{
+	struct timespec wait_time;
+	iostate_t last_main_state, current_main_state;
+	int last_main_incarnation, current_main_incarnation;
+	int logged_incarnation = 0;
+	int wait_ret;
+
+	(void)arg;
+
+	/* Start with wherever we're at now */
+	pthread_mutex_lock(&state_mutex);
+	last_main_state = main_state;
+	last_main_incarnation = main_incarnation;
+	pthread_mutex_unlock(&state_mutex);
+
+	while (thread_active) {
+		pthread_mutex_lock(&state_mutex);
+		clock_gettime(CLOCK_REALTIME, &wait_time);
+		wait_time.tv_sec += sleeptime;
+		wait_ret = pthread_cond_timedwait(&state_cond, &state_mutex,
+						  &wait_time);
+		current_main_state = main_state;
+		current_main_incarnation = main_incarnation;
+		pthread_mutex_unlock(&state_mutex);
+
+		if (!thread_active)
+			break;
+
+		/* No system call in progress */
+		if (!current_main_state)
+			continue;
+
+		/* if the state or incarnation changed, the main qdiskd
+		 * thread is healthy */
+		if (current_main_state != last_main_state ||
+		    current_main_incarnation != last_main_incarnation) {
+			last_main_state = current_main_state;
+			last_main_incarnation = current_main_incarnation;
+			continue;
+		}
+
+		/* A zero return means the wait ended early (a broadcast that
+		 * arrived after we had already sampled this state, or a
+		 * spurious wakeup), so a full sleeptime has not elapsed and
+		 * nothing can be called hung yet. */
+		if (wait_ret == 0)
+			continue;
+
+		/* Don't log things twice */
+		if (logged_incarnation == current_main_incarnation)
+			continue;
+		logged_incarnation = current_main_incarnation;
+
+		logt_print(LOG_WARNING, "qdiskd: %s "
+			   "(system call) has hung for %d seconds\n",
+			   state_to_string(current_main_state), sleeptime);
+		logt_print(LOG_WARNING,
+			   "In %d more seconds, we will be evicted\n",
+			   (qdisk_timeout-sleeptime));
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Start the I/O nanny thread.
+ *
+ * timeout: value, in seconds, stored as qdisk_timeout; the nanny waits
+ *          timeout / 2 seconds (integer division) between checks.
+ *
+ * Returns the result of pthread_create(): 0 on success, an error number
+ * otherwise.  On failure the module is left marked as not running.
+ */
+int
+io_nanny_start(int timeout)
+{
+	int ret;
+
+	pthread_mutex_lock(&state_mutex);
+
+	sleeptime = timeout / 2;
+	qdisk_timeout = timeout;
+	thread_active = 1;
+
+	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+	if (ret != 0) {
+		/* No thread exists; the thread id contents are undefined
+		 * after a failed pthread_create(), so reset both. */
+		thread_active = 0;
+		io_nanny_tid = 0;
+	}
+	pthread_mutex_unlock(&state_mutex);
+
+	return ret;
+}
+
+
+/*
+ * Stop the I/O nanny thread and wait for it to exit.
+ *
+ * Does nothing when the nanny is not marked active or no thread id is
+ * recorded (never started, or already stopped), so pthread_join() is
+ * never handed an id that does not name a thread.
+ *
+ * Always returns 0.
+ */
+int
+io_nanny_stop(void)
+{
+	if (!thread_active || io_nanny_tid == 0)
+		return 0;
+
+	thread_active = 0;
+	/* Wake the nanny so it notices thread_active without waiting out
+	 * its current timeout. */
+	pthread_cond_broadcast(&state_cond);
+	pthread_join(io_nanny_tid, NULL);
+	io_nanny_tid = 0;
+
+	return 0;
+}
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
new file mode 100644
index 0000000..7dd7bf6
--- /dev/null
+++ b/cman/qdisk/iostate.h
@@ -0,0 +1,17 @@
+#ifndef _IOSTATE_H
+#define _IOSTATE_H
+
+/*
+ * I/O states reported through io_state().  STATE_NONE must stay 0: the
+ * nanny thread treats a zero state as "no system call in progress".
+ */
+typedef enum {
+	STATE_NONE = 0,		/* not inside a tracked system call */
+	STATE_READ = 1,		/* inside read() */
+	STATE_WRITE = 2,	/* inside write() */
+	STATE_LSEEK = 3,	/* inside lseek() */
+	STATE_UNKNOWN = 4	/* not set anywhere in this change; has no name in
+				 * iostate.c's lookup table */
+} iostate_t;
+
+/* Record the I/O state the caller is entering; pass STATE_NONE once the
+ * system call has returned. */
+void io_state(iostate_t state);
+
+/* Start the nanny thread; 'timeout' is in seconds and the thread checks
+ * every timeout / 2 seconds.  Returns the pthread_create() result
+ * (0 on success). */
+int io_nanny_start(int timeout);
+/* Ask the nanny thread to exit and join it.  Returns 0. */
+int io_nanny_stop(void);
+
+#endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index d1865cd..a6be5a8 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -28,6 +28,8 @@
#define LOG_DAEMON_NAME "qdiskd"
#define LOG_MODE_DEFAULT LOG_MODE_OUTPUT_SYSLOG|LOG_MODE_OUTPUT_FILE
+#include "iostate.h"
+
/* from main.c */
void set_priority(int queue, int prio);
@@ -1793,9 +1795,14 @@ main(int argc, char **argv)
goto out;
}
*/
+
+ io_nanny_start(ctx.qc_tko * ctx.qc_interval);
+
if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
cman_unregister_quorum_device(ctx.qc_cman_admin);
+ io_nanny_stop();
+
quorum_logout(&ctx);
out:
/* free cman handle to avoid leak in cman */
More information about the Cluster-cvs
mailing list