From 78f321b32cb8c8832969d4468a06c34b6d0ea6ff Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 20 Jul 2010 15:32:07 +0000 Subject: [PATCH] [REGEX] add a fingerprinting facility to allow test code to compare dfas --- libdm/libdevmapper.h | 8 ++ libdm/regex/matcher.c | 123 ++++++++++++++++++++++++++++ unit-tests/regex/matcher_t.c | 1 + unit-tests/regex/matcher_t.expected | 1 + 4 files changed, 133 insertions(+) diff --git a/libdm/libdevmapper.h b/libdm/libdevmapper.h index 101ffb7da..655d66fe0 100644 --- a/libdm/libdevmapper.h +++ b/libdm/libdevmapper.h @@ -1007,6 +1007,14 @@ struct dm_regex *dm_regex_create(struct dm_pool *mem, const char **patterns, */ int dm_regex_match(struct dm_regex *regex, const char *s); +/* + * This is useful for regression testing only. The idea is if two + * fingerprints are different, then the two dfas are certainly not + * isomorphic. If two fingerprints _are_ the same then it's very likely + * that the dfas are isomorphic. + */ +uint32_t dm_regex_fingerprint(struct dm_regex *regex); + /********************* * reporting functions *********************/ diff --git a/libdm/regex/matcher.c b/libdm/regex/matcher.c index 350a35303..d8d856c84 100644 --- a/libdm/regex/matcher.c +++ b/libdm/regex/matcher.c @@ -363,3 +363,126 @@ int dm_regex_match(struct dm_regex *regex, const char *s) /* subtract 1 to get back to zero index */ return r - 1; } + +/* + * The next block of code concerns calculating a fingerprint for the dfa. + * + * We're not calculating a minimal dfa in _calculate_state (maybe a future + * improvement). As such it's possible that two non-isomorphic dfas + * recognise the same language. This can only really happen if you start + * with equivalent, but different regexes (for example the simplifier in + * parse_rx.c may have changed). + * + * The code is inefficient; repeatedly searching a singly linked list for + * previously seen nodes. Not worried since this is test code. + */ +struct node_list { + unsigned node_id; + struct dfa_state *node; + struct node_list *next; +}; + +struct printer { + struct dm_pool *mem; + struct node_list *pending; + struct node_list *processed; + unsigned next_index; +}; + +static uint32_t randomise_(uint32_t n) +{ + /* 2^32 - 5 */ + uint32_t const prime = (~0) - 4; + return n * prime; +} + +static int seen_(struct node_list *n, struct dfa_state *node, uint32_t *i) +{ + while (n) { + if (n->node == node) { + *i = n->node_id; + return 1; + } + n = n->next; + } + + return 0; +} + +/* + * Push node if it's not been seen before, returning a unique index. + */ +static uint32_t push_node_(struct printer *p, struct dfa_state *node) +{ + uint32_t i; + if (seen_(p->pending, node, &i) || + seen_(p->processed, node, &i)) + return i; + else { + struct node_list *n = dm_pool_alloc(p->mem, sizeof(*n)); + assert(n); + n->node_id = p->next_index++; + n->node = node; + n->next = p->pending; + p->pending = n; + return n->node_id; + } +} + +/* + * Pop the front node, and fill out it's previously assigned index. + */ +static struct dfa_state *pop_node_(struct printer *p) +{ + struct dfa_state *node = NULL; + + if (p->pending) { + struct node_list *n = p->pending; + p->pending = n->next; + n->next = p->processed; + p->processed = n; + + node = n->node; + } + + return node; +} + +static uint32_t combine_(uint32_t n1, uint32_t n2) +{ + return ((n1 << 8) | (n1 >> 24)) ^ randomise_(n2); +} + +static uint32_t fingerprint_(struct printer *p) +{ + int c; + uint32_t result = 0; + struct dfa_state *node; + + while ((node = pop_node_(p))) { + result = combine_(result, node->final); + for (c = 0; c < 256; c++) + result = combine_(result, + push_node_(p, node->lookup[c])); + } + + return result; +} + +uint32_t dm_regex_fingerprint(struct dm_regex *regex) +{ + uint32_t result; + struct printer p; + struct dm_pool *mem = dm_pool_create("regex fingerprint", 1024); + + assert(mem); + p.mem = mem; + p.pending = NULL; + p.processed = NULL; + p.next_index = 0; + + push_node_(&p, regex->start); + result = fingerprint_(&p); + dm_pool_destroy(mem); + return result; +} diff --git a/unit-tests/regex/matcher_t.c b/unit-tests/regex/matcher_t.c index 1eb1b3c48..063e864db 100644 --- a/unit-tests/regex/matcher_t.c +++ b/unit-tests/regex/matcher_t.c @@ -135,6 +135,7 @@ int main(int argc, char **argv) goto err; } + printf("fingerprint: %x\n", dm_regex_fingerprint(scanner)); _scan_input(scanner, regex); _free_regex(regex, nregex); diff --git a/unit-tests/regex/matcher_t.expected b/unit-tests/regex/matcher_t.expected index aaf071c57..5ace51c4b 100644 --- a/unit-tests/regex/matcher_t.expected +++ b/unit-tests/regex/matcher_t.expected @@ -1,4 +1,5 @@ Matcher built with 23 dfa states +fingerprint: 352b6c4f /dev/loop/0 : loop/[0-9]+ /dev/loop/1 : loop/[0-9]+ /dev/loop/2 : loop/[0-9]+ -- 2.43.5