static SaVersionT version = { 'B', 1, 1 };
#define DEBUGGING_HISTORY 100
-static char debugging[DEBUGGING_HISTORY][128];
-static int idx = 0;
-#define LOG_SPRINT(f, arg...) do {\
- idx++; \
- idx = idx % DEBUGGING_HISTORY; \
- sprintf(debugging[idx], f, ## arg); \
+//static char debugging[DEBUGGING_HISTORY][128];
+//static int idx = 0;
+#define LOG_SPRINT(cc, f, arg...) do { \
+ cc->idx++; \
+ cc->idx = cc->idx % DEBUGGING_HISTORY; \
+ sprintf(cc->debugging[cc->idx], f, ## arg); \
} while (0)
static int log_resp_rec = 0;
int checkpoints_needed;
uint32_t checkpoint_requesters[MAX_CHECKPOINT_REQUESTERS];
struct checkpoint_data *checkpoint_list;
+ int idx;
+ char debugging[DEBUGGING_HISTORY][128];
};
/* FIXME: Need lock for this */
*/
switch (export_checkpoint(cp)) {
case -EEXIST:
- LOG_SPRINT("[%s] Checkpoint for %u already handled%s",
+ LOG_SPRINT(entry, "[%s] Checkpoint for %u already handled%s",
SHORT_UUID(entry->name.value), cp->requester,
(leaving) ? "(L)": "");
LOG_COND(log_checkpoint,
cp = entry->checkpoint_list;
break;
case 0:
- LOG_SPRINT("[%s] Checkpoint data available for node %u%s",
+ LOG_SPRINT(entry, "[%s] Checkpoint data available for node %u%s",
SHORT_UUID(entry->name.value), cp->requester,
(leaving) ? "(L)": "");
LOG_COND(log_checkpoint,
"[%s] Skipping resend of %s/#%u...",
SHORT_UUID(entry->name.value),
_RQ_TYPE(tfr->request_type), tfr->seq);
- LOG_SPRINT("### No resend: [%s] %s/%u ###",
+ LOG_SPRINT(entry, "### No resend: [%s] %s/%u ###",
SHORT_UUID(entry->name.value),
_RQ_TYPE(tfr->request_type), tfr->seq);
SHORT_UUID(entry->name.value),
_RQ_TYPE(tfr->request_type),
tfr->seq, entry->lowest_id);
- LOG_SPRINT("*** Resending: [%s] %s/%u ***",
+ LOG_SPRINT(entry, "*** Resending: [%s] %s/%u ***",
SHORT_UUID(entry->name.value),
_RQ_TYPE(tfr->request_type), tfr->seq);
r = cluster_send(tfr);
free(tfr);
continue;
}
- LOG_SPRINT("[%s] Checkpoint prepared for %u",
+ LOG_SPRINT(entry, "[%s] Checkpoint prepared for %u",
SHORT_UUID(entry->name.value), tfr->originator);
LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u",
SHORT_UUID(entry->name.value), tfr->originator);
if (tfr->request_type == DM_CLOG_CHECKPOINT_READY) {
if (my_cluster_id == tfr->originator) {
/* Redundant checkpoints ignored if match->valid */
- LOG_SPRINT("[%s] CHECKPOINT_READY notification from %u",
+ LOG_SPRINT(match, "[%s] CHECKPOINT_READY notification from %u",
SHORT_UUID(tfr->uuid), nodeid);
if (import_checkpoint(match, (match->state != INVALID))) {
- LOG_SPRINT("[%s] Failed to import checkpoint from %u",
+ LOG_SPRINT(match,
+ "[%s] Failed to import checkpoint from %u",
SHORT_UUID(tfr->uuid), nodeid);
LOG_ERROR("[%s] Failed to import checkpoint from %u",
SHORT_UUID(tfr->uuid), nodeid);
/* Could we retry? */
goto out;
} else if (match->state == INVALID) {
- LOG_SPRINT("[%s] Checkpoint data received from %u. Log is now valid",
+ LOG_SPRINT(match,
+ "[%s] Checkpoint data received from %u. Log is now valid",
SHORT_UUID(match->name.value), nodeid);
LOG_COND(log_checkpoint,
"[%s] Checkpoint data received from %u. Log is now valid",
flush_startup_list(match);
} else {
- LOG_SPRINT("[%s] Redundant checkpoint from %u ignored.",
+ LOG_SPRINT(match,
+ "[%s] Redundant checkpoint from %u ignored.",
SHORT_UUID(tfr->uuid), nodeid);
}
}
SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i]);
break;
}
- LOG_SPRINT("[%s] Checkpoint prepared for %u* (%s)",
+ LOG_SPRINT(match, "[%s] Checkpoint prepared for %u* (%s)",
SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i],
(log_get_state(tfr) != LOG_RESUMED)? "LOG_RESUMED": "LOG_SUSPENDED");
LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u*",
LOG_ERROR("HISTORY::");
for (i = 0; i < DEBUGGING_HISTORY; i++) {
- idx++;
- idx = idx % DEBUGGING_HISTORY;
- if (debugging[idx][0] == '\0')
+ match->idx++;
+ match->idx = match->idx % DEBUGGING_HISTORY;
+ if (match->debugging[match->idx][0] == '\0')
continue;
- LOG_ERROR("%d:%d) %s", i, idx, debugging[idx]);
+ LOG_ERROR("%d:%d) %s", i, match->idx,
+ match->debugging[match->idx]);
}
} else if (!(tfr->request_type & DM_CLOG_RESPONSE) ||
(tfr->originator == my_cluster_id)) {
if (!response)
- LOG_SPRINT("SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s",
+ LOG_SPRINT(match, "SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s",
tfr->seq, SHORT_UUID(tfr->uuid),
_RQ_TYPE(tfr->request_type),
tfr->originator, (response) ? "YES" : "NO");
else
- LOG_SPRINT("SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s, RSPR=%u",
+ LOG_SPRINT(match, "SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s, RSPR=%u",
tfr->seq, SHORT_UUID(tfr->uuid),
_RQ_TYPE(tfr->request_type),
tfr->originator, (response) ? "YES" : "NO",
LOG_COND(log_membership_change, "[%s] Server unchanged at %u (%u joined)",
SHORT_UUID(match->name.value),
lowest, joined->nodeid);
- LOG_SPRINT("+++ UUID=%s %u join +++",
+ LOG_SPRINT(match, "+++ UUID=%s %u join +++",
SHORT_UUID(match->name.value), joined->nodeid);
}
struct clog_tfr *tfr;
struct checkpoint_data *p_cp, *c_cp;
- LOG_SPRINT("--- UUID=%s %u left ---",
+ LOG_SPRINT(match, "--- UUID=%s %u left ---",
SHORT_UUID(match->name.value), left->nodeid);
/* Am I leaving? */
int i;
SaAisErrorT rv;
- for (i = 0; i < DEBUGGING_HISTORY; i++)
- debugging[i][0] = '\0';
+// for (i = 0; i < DEBUGGING_HISTORY; i++)
+// debugging[i][0] = '\0';
INIT_LIST_HEAD(&clog_cpg_list);
rv = saCkptInitialize(&ckpt_handle, &callbacks, &version);
t = (struct clog_tfr *)p;
LOG_ERROR(" %s/%u", _RQ_TYPE(t->request_type), t->seq);
}
- }
- LOG_ERROR("Command History:");
- for (i = 0; i < DEBUGGING_HISTORY; i++) {
- idx++;
- idx = idx % DEBUGGING_HISTORY;
- if (debugging[idx][0] == '\0')
- continue;
- LOG_ERROR("%d:%d) %s", i, idx, debugging[idx]);
+ LOG_ERROR("Command History:");
+ for (i = 0; i < DEBUGGING_HISTORY; i++) {
+ entry->idx++;
+ entry->idx = entry->idx % DEBUGGING_HISTORY;
+ if (entry->debugging[entry->idx][0] == '\0')
+ continue;
+ LOG_ERROR("%d:%d) %s", i, entry->idx,
+ entry->debugging[entry->idx]);
+ }
}
}
#define LOG_OFFSET 2
#define RESYNC_HISTORY 50
-static char resync_history[RESYNC_HISTORY][128];
-static int idx = 0;
-#define LOG_SPRINT(f, arg...) do {\
- idx++; \
- idx = idx % RESYNC_HISTORY; \
- sprintf(resync_history[idx], f, ## arg); \
+//static char resync_history[RESYNC_HISTORY][128];
+//static int idx = 0;
+#define LOG_SPRINT(_lc, f, arg...) do { \
+ lc->idx++; \
+ lc->idx = lc->idx % RESYNC_HISTORY; \
+ sprintf(lc->resync_history[lc->idx], f, ## arg); \
} while (0)
struct log_header {
uint64_t disk_nr_regions;
size_t disk_size; /* size of disk_buffer in bytes */
void *disk_buffer; /* aligned memory for O_DIRECT */
+ int idx;
+ char resync_history[RESYNC_HISTORY][128];
};
struct mark_entry {
if (strlen(tfr->data) != tfr->data_size) {
LOG_ERROR("Received constructor request with bad data");
- LOG_ERROR("strlen(tfr->data)[%d] != tfr->data_size[%d]",
- (int)strlen(tfr->data), tfr->data_size);
+ LOG_ERROR("strlen(tfr->data)[%d] != tfr->data_size[%llu]",
+ (int)strlen(tfr->data),
+ (unsigned long long)tfr->data_size);
LOG_ERROR("tfr->data = '%s' [%d]",
tfr->data, (int)strlen(tfr->data));
return -EINVAL;
lc->sync_count = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
- LOG_SPRINT("[%s] Initial sync_count = %llu",
+ LOG_SPRINT(lc, "[%s] Initial sync_count = %llu",
SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count);
lc->sync_search = 0;
lc->state = LOG_RESUMED;
*/
static int clog_get_resync_work(struct clog_tfr *tfr)
{
- struct {int i; uint64_t r; } *pkg = (void *)tfr->data;
+ struct {
+ int32_t i;
+ uint32_t arch_padding;
+ uint64_t r;
+ } *pkg = (void *)tfr->data;
struct log_c *lc = get_log(tfr->uuid);
if (!lc)
* FIXME: handle intermittent errors during recovery
* by resetting sync_search... but not to many times.
*/
- LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Recovery finished",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator);
return 0;
if (lc->recovering_region != (uint64_t)-1) {
if (lc->recoverer == tfr->originator) {
- LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Re-requesting work (%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)lc->recovering_region);
pkg->i = 1;
LOG_COND(log_resend_requests, "***** RE-REQUEST *****");
} else {
- LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Someone already recovering (%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)lc->recovering_region);
free(del);
if (!log_test_bit(lc->sync_bits, pkg->r)) {
- LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Assigning priority resync work (%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)pkg->r);
lc->sync_search);
if (pkg->r >= lc->region_count) {
- LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Resync work complete.",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator);
return 0;
lc->sync_search = pkg->r + 1;
- LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Assigning resync work (%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)pkg->r);
*/
static int clog_set_region_sync(struct clog_tfr *tfr)
{
- struct { uint64_t region; int in_sync; } *pkg = (void *)tfr->data;
+ struct {
+ uint64_t region;
+ uint32_t arch_padding;
+ int32_t in_sync;
+ } *pkg = (void *)tfr->data;
struct log_c *lc = get_log(tfr->uuid);
if (!lc)
if (pkg->in_sync) {
if (log_test_bit(lc->sync_bits, pkg->region)) {
- LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Region already set (%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)pkg->region);
lc->sync_count++;
/* The rest of this section is all for debugging */
- LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Setting region (%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)pkg->region);
if (!log_test_bit(lc->sync_bits,
(pkg->region) ? pkg->region - 1 : 0)) {
- LOG_SPRINT("*** Previous bit not set ***");
+ LOG_SPRINT(lc, "*** Previous bit not set ***");
lc->skip_bit_warning = (pkg->region) ?
pkg->region - 1 : 0;
}
} else if (log_test_bit(lc->sync_bits, pkg->region)) {
lc->sync_count--;
log_clear_bit(lc, lc->sync_bits, pkg->region);
- LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"Unsetting region (%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)pkg->region);
if (lc->sync_count != count_bits32(lc->sync_bits, lc->bitset_uint32_count)) {
unsigned long long reset = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
- LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"sync_count(%llu) != bitmap count(%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)lc->sync_count, reset);
}
if (lc->sync_count > lc->region_count)
- LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
"(lc->sync_count > lc->region_count) - this is bad",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator);
if (lc->sync_count != count_bits32(lc->sync_bits, lc->bitset_uint32_count)) {
unsigned long long reset = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
- LOG_SPRINT("get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: "
"sync_count(%llu) != bitmap count(%llu)",
tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
(unsigned long long)lc->sync_count, reset);
static int clog_is_remote_recovering(struct clog_tfr *tfr)
{
uint64_t region = *((uint64_t *)(tfr->data));
- struct { int is_recovering; uint64_t in_sync_hint; } *pkg = (void *)tfr->data;
+ struct {
+ int32_t is_recovering;
+ uint32_t arch_padding;
+ uint64_t in_sync_hint;
+ } *pkg = (void *)tfr->data;
struct log_c *lc = get_log(tfr->uuid);
if (!lc)
/*
* Remember, 'lc->sync_search' is 1 plus the region
* currently being recovered. So, we must take off 1
- * to account for that.
+ * to account for that; but only if 'sync_search > 1'.
*/
- pkg->in_sync_hint = (lc->sync_search - 1);
+ pkg->in_sync_hint = lc->sync_search ? (lc->sync_search - 1) : 0;
LOG_DBG("[%s] Region is %s: %llu",
SHORT_UUID(lc->uuid),
(region == lc->recovering_region) ?
sprintf(*buf, "%llu %u", (unsigned long long)lc->recovering_region,
lc->recoverer);
- LOG_SPRINT("CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: "
+ LOG_SPRINT(lc, "CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: "
"recovering_region=%llu, recoverer=%u, sync_count=%llu",
SHORT_UUID(lc->uuid), debug_who,
(unsigned long long)lc->recovering_region,
if (!strncmp(which, "recovering_region", 17)) {
sscanf(buf, "%llu %u", (unsigned long long *)&lc->recovering_region,
&lc->recoverer);
- LOG_SPRINT("CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: "
+ LOG_SPRINT(lc, "CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: "
"recovering_region=%llu, recoverer=%u",
SHORT_UUID(lc->uuid),
(unsigned long long)lc->recovering_region, lc->recoverer);
LOG_ERROR("");
LOG_ERROR("LOG COMPONENT DEBUGGING::");
LOG_ERROR("Official log list:");
- __list_for_each(l, &log_list) {
+ LOG_ERROR("Pending log list:");
+ __list_for_each(l, &log_pending_list) {
lc = list_entry(l, struct log_c, list);
LOG_ERROR("%s", lc->uuid);
- LOG_ERROR(" recoverer : %u", lc->recoverer);
- LOG_ERROR(" recovering_region: %llu",
- (unsigned long long)lc->recovering_region);
- LOG_ERROR(" recovery_halted : %s", (lc->recovery_halted) ?
- "YES" : "NO");
LOG_ERROR("sync_bits:");
print_bits((char *)lc->sync_bits,
lc->bitset_uint32_count * sizeof(*lc->sync_bits), 1);
lc->bitset_uint32_count * sizeof(*lc->clean_bits), 1);
}
- LOG_ERROR("Pending log list:");
- __list_for_each(l, &log_pending_list) {
+ __list_for_each(l, &log_list) {
lc = list_entry(l, struct log_c, list);
LOG_ERROR("%s", lc->uuid);
+ LOG_ERROR(" recoverer : %u", lc->recoverer);
+ LOG_ERROR(" recovering_region: %llu",
+ (unsigned long long)lc->recovering_region);
+ LOG_ERROR(" recovery_halted : %s", (lc->recovery_halted) ?
+ "YES" : "NO");
LOG_ERROR("sync_bits:");
print_bits((char *)lc->sync_bits,
lc->bitset_uint32_count * sizeof(*lc->sync_bits), 1);
LOG_ERROR("clean_bits:");
print_bits((char *)lc->clean_bits,
lc->bitset_uint32_count * sizeof(*lc->clean_bits), 1);
- }
- __list_for_each(l, &log_list) {
lc = list_entry(l, struct log_c, list);
LOG_ERROR("Validating %s::", SHORT_UUID(lc->uuid));
r = find_next_zero_bit(lc->sync_bits, lc->region_count, 0);
LOG_ERROR("ADJUSTING SYNC_COUNT");
lc->sync_count = lc->region_count;
}
- }
- LOG_ERROR("Resync request history:");
- for (i = 0; i < RESYNC_HISTORY; i++) {
- idx++;
- idx = idx % RESYNC_HISTORY;
- if (resync_history[idx][0] == '\0')
- continue;
- LOG_ERROR("%d:%d) %s", i, idx, resync_history[idx]);
+ LOG_ERROR("Resync request history:");
+ for (i = 0; i < RESYNC_HISTORY; i++) {
+ lc->idx++;
+ lc->idx = lc->idx % RESYNC_HISTORY;
+ if (lc->resync_history[lc->idx][0] == '\0')
+ continue;
+ LOG_ERROR("%d:%d) %s", i, lc->idx,
+ lc->resync_history[lc->idx]);
+ }
}
}